From 8d05e0dc5a683bcd4a6bf801bfd8458a2f501152 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 29 May 2023 10:27:21 +0000 Subject: [PATCH 01/10] add README --- README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 00000000000000..e69de29bb2d1d6 From 6c20653d3a73d17977e0a8dd8ab12f01ac2bcfc4 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 27 Jun 2023 12:07:41 +0000 Subject: [PATCH 02/10] add code base --- VERSION | 1 + paddlevlp/__init__.py | 19 + paddlevlp/datasets/__init__.py | 17 + paddlevlp/datasets/caption_dataset.py | 98 + paddlevlp/datasets/coco_caption.py | 17 + paddlevlp/datasets/dataset.py | 844 ++++++++ paddlevlp/examples/blip2/__init__.py | 13 + paddlevlp/examples/blip2/run_predict.py | 98 + .../examples/blip2/run_pretrain_stage2.py | 271 +++ paddlevlp/models/__init__.py | 16 + paddlevlp/models/blip2/__init__.py | 13 + paddlevlp/models/blip2/configuration.py | 400 ++++ paddlevlp/models/blip2/modeling.py | 1925 +++++++++++++++++ paddlevlp/optimization.py | 106 + paddlevlp/processors/__init__.py | 16 + paddlevlp/processors/blip_processing.py | 661 ++++++ .../processors/image_processing_utils.py | 553 +++++ paddlevlp/processors/image_transform_utils.py | 795 +++++++ paddlevlp/processors/image_utils.py | 305 +++ paddlevlp/processors/processing_utils.py | 538 +++++ paddlevlp/processors/utils.py | 27 + paddlevlp/trainer/__init__.py | 15 + paddlevlp/trainer/trainer.py | 15 + paddlevlp/utils/__init__.py | 13 + paddlevlp/utils/downloader.py | 492 +++++ paddlevlp/utils/env.py | 84 + paddlevlp/utils/log.py | 123 ++ requirements.txt | 2 + setup.py | 73 + 29 files changed, 7550 insertions(+) create mode 100644 VERSION create mode 100644 paddlevlp/__init__.py create mode 100644 paddlevlp/datasets/__init__.py create mode 100644 paddlevlp/datasets/caption_dataset.py create mode 100644 paddlevlp/datasets/coco_caption.py create mode 100644 paddlevlp/datasets/dataset.py create mode 100644 paddlevlp/examples/blip2/__init__.py create mode 100644 paddlevlp/examples/blip2/run_predict.py create mode 100644 paddlevlp/examples/blip2/run_pretrain_stage2.py create mode 100644 paddlevlp/models/__init__.py create mode 100644 paddlevlp/models/blip2/__init__.py create mode 100644 paddlevlp/models/blip2/configuration.py create mode 100644 paddlevlp/models/blip2/modeling.py create mode 100644 paddlevlp/optimization.py create mode 100644 paddlevlp/processors/__init__.py create mode 100644 paddlevlp/processors/blip_processing.py create mode 100644 paddlevlp/processors/image_processing_utils.py create mode 100644 paddlevlp/processors/image_transform_utils.py create mode 100644 paddlevlp/processors/image_utils.py create mode 100644 paddlevlp/processors/processing_utils.py create mode 100644 paddlevlp/processors/utils.py create mode 100644 paddlevlp/trainer/__init__.py create mode 100644 paddlevlp/trainer/trainer.py create mode 100644 paddlevlp/utils/__init__.py create mode 100644 paddlevlp/utils/downloader.py create mode 100644 paddlevlp/utils/env.py create mode 100644 paddlevlp/utils/log.py create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/VERSION b/VERSION new file mode 100644 index 00000000000000..6e8bf73aa550d4 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/paddlevlp/__init__.py b/paddlevlp/__init__.py new file mode 100644 index 00000000000000..058cfa738ac127 --- /dev/null +++ b/paddlevlp/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2023 paddlepaddle authors. 
all rights reserved. +# copyright 2023 the salesforce team authors and the huggingface team. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from .datasets import * +from .models import * +from .optimization import * +from .processors import * diff --git a/paddlevlp/datasets/__init__.py b/paddlevlp/datasets/__init__.py new file mode 100644 index 00000000000000..22899151a584d9 --- /dev/null +++ b/paddlevlp/datasets/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .caption_dataset import * +from .coco_caption import * +from .dataset import * diff --git a/paddlevlp/datasets/caption_dataset.py b/paddlevlp/datasets/caption_dataset.py new file mode 100644 index 00000000000000..5ef67626f95029 --- /dev/null +++ b/paddlevlp/datasets/caption_dataset.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddlevlp.utils.env import DATA_HOME +from paddlevlp.utils.log import logger + +from .dataset import DatasetBuilder + +# from paddle.dataset.common import md5file +# from paddle.utils.download import get_path_from_url + + +__all__ = ["CaptionDataset"] + + +class CaptionDataset(DatasetBuilder): + """ + Caption dataset. 
+ """ + + URL = "https://bj.bcebos.com/paddlemix/datasets/coco.tar.gz" + META_INFO = collections.namedtuple( + "META_INFO", ("images", "annotations", "images_md5", "annotations_md5") + ) + MD5 = "" + SPLITS = { + "train": META_INFO( + os.path.join("coco", "images"), + os.path.join("coco", "annotations/coco_karpathy_train_debug.json"), + "", + "aa31ac474cf6250ebb81d18348a07ed8", + ), + "val": META_INFO( + os.path.join("coco", "images"), + os.path.join("coco", "annotations/coco_karpathy_val.json"), + "", + "b273847456ef5580e33713b1f7de52a0", + ), + "test": META_INFO( + os.path.join("coco", "images"), + os.path.join("coco", "annotations/coco_karpathy_test.json"), + "", + "3ff34b0ef2db02d01c37399f6a2a6cd1", + ), + } + + def _get_data(self, mode, **kwargs): + # default_root = '/paddle/wangguanzhong/blip-jinman/PaddleNLP/blip2' + logger.info("default dataset root is {}".format(DATA_HOME)) + images, annotations, image_hash, anno_hash = self.SPLITS[mode] + image_fullname = os.path.join(DATA_HOME, images) + anno_fullname = os.path.join(DATA_HOME, annotations) + # if ( + # (not os.path.exists(src_fullname) or (src_data_hash and not md5file(src_fullname) == src_data_hash)) + # or (not os.path.exists(tgt_fullname) or (tgt_data_hash and not md5file(tgt_fullname) == tgt_data_hash)) + # or (not os.path.exists(vocab_fullname) or (vocab_hash and not md5file(vocab_fullname) == vocab_hash)) + # ): + # get_path_from_url(self.URL, default_root, self.MD5) + + return image_fullname, anno_fullname, mode + + def _gen_image_id(self, anno): + img_ids = {} + n = 0 + for ann in anno: + img_id = ann["image_id"] + if img_id not in img_ids.keys(): + img_ids[img_id] = n + n += 1 + return img_ids + + def _read(self, filename, *args): + image_root, anno_path, mode = filename + annotations = json.load(open(anno_path, "r")) + image_ids = self._gen_image_id(annotations) + + for ann in annotations: + image_path = os.path.join(image_root, ann["image"]) + yield_data = {"image": image_path, "image_id": image_ids[ann["image_id"]]} + if mode == "train": + # only train mode has text input + yield_data["text_input"] = ann["caption"] + yield yield_data diff --git a/paddlevlp/datasets/coco_caption.py b/paddlevlp/datasets/coco_caption.py new file mode 100644 index 00000000000000..bee4ae6c15e79e --- /dev/null +++ b/paddlevlp/datasets/coco_caption.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlevlp.datasets.caption_dataset import CaptionDataset + +COCOCaption = CaptionDataset diff --git a/paddlevlp/datasets/dataset.py b/paddlevlp/datasets/dataset.py new file mode 100644 index 00000000000000..d62d8ed0b85ffa --- /dev/null +++ b/paddlevlp/datasets/dataset.py @@ -0,0 +1,844 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import atexit +import inspect +import os +import time +import warnings +from collections import namedtuple + +import datasets +from multiprocess import Pool, RLock + +import paddlevlp + +try: + import paddle.distributed as dist +except Exception: + warnings.warn("paddle.distributed is not contains in you paddle!") + +import importlib +from functools import partial + +from paddle.io import Dataset, IterableDataset +from paddle.utils.download import _get_unique_endpoints + +from paddlevlp.utils.env import DATA_HOME + +__all__ = ["MapDataset", "DatasetBuilder", "IterDataset", "load_dataset"] + +DATASETS_MODULE_PATH = "paddlevlp.datasets." + +# Patch for intranet +from datasets import load_dataset as origin_load_dataset # noqa: E402 + + +def load_from_ppvlp(path, *args, **kwargs): + ppvlp_path = paddlevlp.datasets.__path__[0] + new_path = os.path.split(path)[-1] + new_path = os.path.join(ppvlp_path, "hf_datasets", new_path + ".py") + if os.path.exists(new_path): + return origin_load_dataset(new_path, *args, **kwargs) + else: + return origin_load_dataset(path, *args, **kwargs) + + +datasets.load_dataset = load_from_ppvlp + + +class DatasetTuple: + def __init__(self, splits): + self.identifier_map, identifiers = self._gen_identifier_map(splits) + self.tuple_cls = namedtuple("datasets", identifiers) + self.tuple = self.tuple_cls(*[None for _ in splits]) + + def __getitem__(self, key): + if isinstance(key, (int, slice)): + return self.tuple[key] + if isinstance(key, str): + return getattr(self.tuple, self.identifier_map[key]) + + def __setitem__(self, key, value): + self.tuple = self.tuple._replace(**{self.identifier_map[key]: value}) + + def _gen_identifier_map(self, splits): + identifier_map = {} + identifiers = [] + for i in range(len(splits)): + identifiers.append("splits_" + str(i)) + identifier_map[splits[i]] = "splits_" + str(i) + return identifier_map, identifiers + + def __len__(self): + return len(self.tuple) + + +def import_main_class(module_path): + """ + Import a module at module_path and return its DatasetBuilder class. 
+ + """ + module_path = DATASETS_MODULE_PATH + module_path + module = importlib.import_module(module_path) + main_cls_type = DatasetBuilder + + # Find the main class in our imported module + module_main_cls = None + for name, obj in module.__dict__.items(): + if isinstance(obj, type) and issubclass(obj, main_cls_type): + if name == "DatasetBuilder": + continue + module_main_cls = obj + break + + return module_main_cls + + +def load_from_hf(path, name=None, splits=None, **kwargs): + from datasets import DatasetDict + from datasets import load_dataset as load_hf_dataset + from datasets.features import ClassLabel + + try: + hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) + except FileNotFoundError: + raise FileNotFoundError( + "Couldn't find the dataset script for '" + + path + + "' on PaddleNLP or HuggingFace" + ) + else: + label_list = [] + if isinstance(hf_datasets, DatasetDict): + datasets = DatasetTuple(list(hf_datasets.keys())) + for split, ds in hf_datasets.items(): + for feature in ds.features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets[split] = MapDataset(ds, label_list=label_list) + elif isinstance(hf_datasets, list): + datasets = DatasetTuple(splits) + for i, split in enumerate(splits): + for feature in hf_datasets[i].features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets[split] = MapDataset(hf_datasets[i], label_list=label_list) + else: + for feature in hf_datasets.features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets = MapDataset(hf_datasets, label_list=label_list) + return datasets + + +def load_dataset( + path_or_read_func, name=None, data_files=None, splits=None, lazy=None, **kwargs +): + """ + This method will load a dataset, either form PaddleNLP library or from a + self-defined data loading script, by calling functions in `DatasetBuilder`. + + For all the names of datasets in PaddleNLP library, see here: `dataset_list + `__. + + Either `splits` or `data_files` must be specified. + + Args: + path_or_read_func (str|callable): Name of the dataset processing script + in PaddleNLP library or a custom data reading function. + name (str, optional): Additional name to select a more specific dataset. + Defaults to None. + data_files (str|list|tuple|dict, optional): Defining the path of dataset + files. If None. `splits` must be specified. Defaults to None. + splits (str|list|tuple, optional): Which split of the data to load. If None. + `data_files` must be specified. Defaults to None. + lazy (bool, optional): Weather to return `MapDataset` or an `IterDataset`. + True for `IterDataset`. False for `MapDataset`. If None, return the + default type of this dataset. Defaults to None. + kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`. + + Returns: + A `MapDataset` or `IterDataset` or a tuple of those. + + For how to use this function, please see `dataset_load + `__ + and `dataset_self_defined + `__ + + """ + if inspect.isfunction(path_or_read_func): + assert lazy is not None, "lazy can not be None in custom mode." 
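As a quick, hedged illustration of the API documented above: loading the COCO caption data registered by this patch. The download call in `CaptionDataset._get_data` is commented out, so the `coco/` images and annotation files are assumed to already sit under `DATA_HOME`.

```python
from paddlevlp.datasets import load_dataset

# Resolves paddlevlp.datasets.coco_caption and its DatasetBuilder subclass.
train_ds = load_dataset("coco_caption", splits="train")      # a single MapDataset
print(len(train_ds), train_ds[0].keys())                      # e.g. dict_keys(['image', 'image_id', 'text_input'])

# Requesting several splits returns a tuple-like object indexable by split name.
ds = load_dataset("coco_caption", splits=["train", "val"])
val_ds = ds["val"]                                            # 'text_input' is only present in the train split
```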
+ kwargs["name"] = name + kwargs["data_files"] = data_files + kwargs["splits"] = splits + custom_kwargs = {} + for name in inspect.signature(path_or_read_func).parameters.keys(): + if name in kwargs.keys(): + custom_kwargs[name] = kwargs[name] + + reader_instance = SimpleBuilder(lazy=lazy, read_func=path_or_read_func) + return reader_instance.read(**custom_kwargs) + else: + try: + reader_cls = import_main_class(path_or_read_func) + except ModuleNotFoundError: + datasets = load_from_hf( + path_or_read_func, name=name, splits=splits, **kwargs + ) + else: + reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) + + # Check if selected name and split is valid in this DatasetBuilder + if hasattr(reader_instance, "BUILDER_CONFIGS"): + if name in reader_cls.BUILDER_CONFIGS.keys(): + split_names = reader_cls.BUILDER_CONFIGS[name]["splits"].keys() + else: + raise ValueError( + 'Invalid name "{}". Should be one of {}.'.format( + name, list(reader_cls.BUILDER_CONFIGS.keys()) + ) + ) + elif hasattr(reader_instance, "SPLITS"): + split_names = reader_instance.SPLITS.keys() + else: + raise AttributeError( + "Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder." + ) + + selected_splits = [] + if isinstance(splits, list) or isinstance(splits, tuple): + selected_splits.extend(splits) + else: + selected_splits += [splits] + + for split_name in selected_splits: + if split_name not in split_names and split_name is not None: + raise ValueError( + 'Invalid split "{}". Should be one of {}.'.format( + split_name, list(split_names) + ) + ) + + datasets = reader_instance.read_datasets( + data_files=data_files, splits=splits + ) + return datasets + + +class MapDataset(Dataset): + """ + Wraps a map-style dataset-like object as an instance of `MapDataset`, and equips it + with `map` and other utility methods. All non-magic methods of the raw object + are also accessible. + + Args: + data (list|Dataset): An object with `__getitem__` and `__len__` methods. It could + be a list or a subclass of `paddle.io.Dataset`. + kwargs (dict, optional): Other information to be passed to the dataset. + + For examples of this class, please see `dataset_self_defined + `__. + + """ + + def __init__(self, data, **kwargs): + self.data = data + self._transform_pipline = [] + self.new_data = self.data + self.info = kwargs + self.label_list = self.info.pop("label_list", None) + self.vocab_info = self.info.pop("vocab_info", None) + + def _transform(self, data): + for fn in self._transform_pipline: + data = fn(data) + return data + + def __getitem__(self, idx): + """ + Basic function of `MapDataset` to get sample from dataset with a given + index. + """ + return ( + self._transform(self.new_data[idx]) + if self._transform_pipline + else self.new_data[idx] + ) + + def __len__(self): + """ + Returns the number of samples in dataset. + """ + return len(self.new_data) + + def filter(self, fn, num_workers=0): + """ + Filters samples by the filter function and uses the filtered data to + update this dataset. + + Args: + fn (callable): A filter function that takes a sample as input and + returns a boolean. Samples that return False would be discarded. + num_workers(int, optional): Number of processes for multiprocessing. If + set to 0, it doesn't use multiprocessing. Defaults to `0`. 
+ """ + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 1: + shards = [ + self._shard(num_shards=num_workers, index=index, contiguous=True) + for index in range(num_workers) + ] + kwds_per_shard = [ + dict(self=shards[rank], fn=fn) for rank in range(num_workers) + ] + pool = Pool(num_workers, initargs=(RLock(),)) + + results = [ + pool.apply_async(self.__class__._filter, kwds=kwds) + for kwds in kwds_per_shard + ] + transformed_shards = [r.get() for r in results] + + pool.close() + pool.join() + self.new_data = [] + for i in range(num_workers): + self.new_data += transformed_shards[i].new_data + return self + else: + return self._filter(fn) + + def _filter(self, fn): + self.new_data = [ + self.new_data[idx] + for idx in range(len(self.new_data)) + if fn(self.new_data[idx]) + ] + return self + + def shard(self, num_shards=None, index=None, contiguous=False): + self.new_data = self._shard( + num_shards=num_shards, index=index, contiguous=contiguous + ).data + return self + + def _shard(self, num_shards=None, index=None, contiguous=False): + """ + Split the dataset into `num_shards` pieces. Note that the size of each + shard might be different because the original dataset may not be evenly + divisible. + + Args: + num_shards (int, optional): An integer representing the number of + data shards. If None, `num_shards` would be number of trainers. + Defaults to `None`. + index (int, optional): An integer representing the index of the + current shard. If None, `index` would be the current trainer rank + id. Defaults to `None`. + contiguous: (bool, optional): If true, contiguous chunks of data + will be select for sharding. And total number of examples will + be the same. Otherwise each shard will contain all examples of + dataset whose index mod `num_shards` = `index`. Defaults to `False`. + """ + if num_shards is None: + num_shards = dist.get_world_size() + if index is None: + index = dist.get_rank() + + if contiguous: + div = len(self) // num_shards + mod = len(self) % num_shards + start = div * index + min(index, mod) + end = start + div + (1 if index < mod else 0) + new_data = [self.new_data[idx] for idx in range(start, end)] + else: + new_data = [ + self.new_data[idx] + for idx in range(len(self.new_data)) + if idx % num_shards == index + ] + + return MapDataset(new_data) + + def map(self, fn, lazy=True, batched=False, num_workers=0): + """ + Performs specific function on the dataset to transform and update every sample. + + Args: + fn (callable): Transformations to be performed. It receives single + sample as argument if batched is False. Else it receives all examples. + lazy (bool, optional): If True, transformations would be delayed and + performed on demand. Otherwise, transforms all samples at once. Note that + if `fn` is stochastic, `lazy` should be True or you will get the same + result on all epochs. Defaults to False. + batched(bool, optional): If True, transformations would take all examples as + input and return a collection of transformed examples. Note that if set + True, `lazy` option would be ignored. Defaults to False. + num_workers(int, optional): Number of processes for multiprocessing. If + set to 0, it doesn't use multiprocessing. Note that if set to positive + value, `lazy` option would be ignored. Defaults to 0. 
+ """ + + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 1: + shards = [ + self._shard(num_shards=num_workers, index=index, contiguous=True) + for index in range(num_workers) + ] + kwds_per_shard = [ + dict(self=shards[rank], fn=fn, lazy=False, batched=batched) + for rank in range(num_workers) + ] + pool = Pool(num_workers, initargs=(RLock(),)) + results = [ + pool.apply_async(self.__class__._map, kwds=kwds) + for kwds in kwds_per_shard + ] + transformed_shards = [r.get() for r in results] + pool.close() + pool.join() + self.new_data = [] + for i in range(num_workers): + self.new_data += transformed_shards[i].new_data + return self + else: + return self._map(fn, lazy=lazy, batched=batched) + + def _map(self, fn, lazy=True, batched=False): + if batched: + self.new_data = fn(self.new_data) + elif lazy: + self._transform_pipline.append(fn) + else: + self.new_data = [ + fn(self.new_data[idx]) for idx in range(len(self.new_data)) + ] + return self + + +class IterDataset(IterableDataset): + """ + Wraps a dataset-like object as an instance of `IterDataset`, and equips it with + `map` and other utility methods. All non-magic methods of the raw object + also accessible. + + Args: + data (Iterable): An object with `__iter__` function. It can be a Iterable or a + subclass of `paddle.io.IterableDataset`. + kwargs (dict, optional): Other information to be passed to the dataset. + + For examples of this class, please see `dataset_self_defined + `__. + """ + + def __init__(self, data, **kwargs): + self.data = data + self._transform_pipline = [] + self._filter_pipline = [] + + self.label_list = kwargs.pop("label_list", None) + self.vocab_info = kwargs.pop("vocab_info", None) + + def _transform(self, data): + for fn in self._transform_pipline: + data = fn(data) + return data + + def _shard_filter(self, num_samples): + return True + + def _filter(self, data): + for fn in self._filter_pipline: + if not fn(data): + return False + return True + + def __iter__(self): + """ + yields sample sequentially. + """ + num_samples = 0 + if inspect.isfunction(self.data): + for example in self.data(): + if ( + not self._filter_pipline or self._filter(self._filter_pipline) + ) and self._shard_filter(num_samples=num_samples): + yield self._transform( + example + ) if self._transform_pipline else example + num_samples += 1 + else: + if inspect.isgenerator(self.data): + warnings.warn( + "Reciving generator as data source, data can only be iterated once" + ) + for example in self.data: + if ( + not self._filter_pipline or self._filter(self._filter_pipline) + ) and self._shard_filter(num_samples=num_samples): + yield self._transform( + example + ) if self._transform_pipline else example + num_samples += 1 + + def filter(self, fn): + """ + Filters samples by the filter function and uses the filtered data to + update this dataset. + + Args: + fn (callable): A filter function that takes a sample as input and + returns a boolean. Samples that return False are discarded. + """ + + self._filter_pipline.append(fn) + + return self + + def shard(self, num_shards=None, index=None): + """ + Split the dataset into `num_shards` pieces. + + Args: + num_shards (int, optional): An integer representing the number of + data shards. If None, `num_shards` would be number of trainers. + Defaults to None. + index (int, optional): An integer representing the index of the + current shard. If None, `index` would be the current trainer rank + id. Defaults to None. 
+ """ + if num_shards is None: + num_shards = dist.get_world_size() + if index is None: + index = dist.get_rank() + + def sharder(num_shards, index, num_samples): + if num_samples % num_shards == index: + return True + else: + return False + + fn = partial(sharder, num_shards=num_shards, index=index) + self._shard_filter = fn + return self + + def map(self, fn): + """ + Performs specific function on the dataset to transform and update every sample. + + Args: + fn (callable): Transformations to be performed. It receives single + sample as argument. + """ + + self._transform_pipline.append(fn) + + return self + + +class DatasetBuilder: + """ + A base class for all DatasetBuilder. It provides a `read()` function to turn + a data file into a MapDataset or IterDataset. + + `_get_data()` function and `_read()` function should be implemented to download + data file and read data file into a `Iterable` of the examples. + + For how to define a custom `DatasetBuilder`, please see `contribute_dataset + `__. + """ + + lazy = False + + def __init__(self, lazy=None, name=None, **config): + if lazy is not None: + self.lazy = lazy + self.name = name + self.config = config + + def read_datasets(self, splits=None, data_files=None): + def remove_if_exit(filepath): + if isinstance(filepath, (list, tuple)): + for file in filepath: + try: + os.remove(file) + except OSError: + pass + else: + try: + os.remove(filepath) + except OSError: + pass + + if data_files is None: + if splits is None: + splits = ( + list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) + if hasattr(self, "BUILDER_CONFIGS") + else list(self.SPLITS.keys()) + ) + + assert ( + isinstance(splits, str) + or (isinstance(splits, list) and isinstance(splits[0], str)) + or (isinstance(splits, tuple) and isinstance(splits[0], str)) + ), "`splits` should be a string or list of string or a tuple of string." + + if isinstance(splits, str): + splits = [splits] + datasets = DatasetTuple(splits) + parallel_env = dist.ParallelEnv() + unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:]) + # move register hook to first and register togather + lock_files = [] + for split in splits: + lock_file = os.path.join(DATA_HOME, self.__class__.__name__) + if self.name is not None: + lock_file = lock_file + "." + self.name + lock_file += "." + split + ".done" + "." + str(os.getppid()) + lock_files.append(lock_file) + # Must register to all procs to make the lock file can be removed + # when any proc breaks. Otherwise, the single registered proc may + # not receive proper singal send by the parent proc to exit. + atexit.register(lambda: remove_if_exit(lock_files)) + for split in splits: + filename = self._get_data(split) + lock_file = os.path.join(DATA_HOME, self.__class__.__name__) + if self.name is not None: + lock_file = lock_file + "." + self.name + lock_file += "." + split + ".done" + "." + str(os.getppid()) + # `lock_file` indicates the finished status of`_get_data`. + # `_get_data` only works in the `unique_endpoints` specified + # proc since `get_path_from_url` only work for it. The other + # procs wait `_get_data` to be finished. + if parallel_env.current_endpoint in unique_endpoints: + f = open(lock_file, "w") + f.close() + else: + while not os.path.exists(lock_file): + time.sleep(1) + datasets[split] = self.read(filename=filename, split=split) + else: + assert ( + isinstance(data_files, str) + or isinstance(data_files, tuple) + or isinstance(data_files, list) + ), "`data_files` should be a string or tuple or list of strings." 
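For context, a small self-contained sketch of the `MapDataset` transformation utilities documented earlier in this file; the samples are made up purely for illustration.

```python
from paddlevlp.datasets import MapDataset

ds = MapDataset([{"text_input": "a cat", "image_id": i} for i in range(10)])

# Lazy per-sample transform: applied on access, so a stochastic fn stays stochastic across epochs.
ds.map(lambda ex: {**ex, "text_input": ex["text_input"].upper()}, lazy=True)

# filter() rewrites the underlying data in place and returns the same dataset object.
ds.filter(lambda ex: ex["image_id"] % 2 == 0)

# Keep a contiguous half of the remaining samples (shard 0 of 2).
ds.shard(num_shards=2, index=0, contiguous=True)
print(len(ds), ds[0]["text_input"])   # -> 3 A CAT
```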
+ if isinstance(data_files, str): + data_files = [data_files] + default_split = "train" + if splits: + if isinstance(splits, str): + splits = [splits] + datasets = DatasetTuple(splits) + assert len(splits) == len( + data_files + ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." + for i in range(len(data_files)): + datasets[splits[i]] = self.read( + filename=data_files[i], split=splits[i] + ) + else: + datasets = DatasetTuple( + ["split" + str(i) for i in range(len(data_files))] + ) + for i in range(len(data_files)): + datasets["split" + str(i)] = self.read( + filename=data_files[i], split=default_split + ) + + return datasets if len(datasets) > 1 else datasets[0] + + def read(self, filename, split="train"): + """ + Returns a dataset containing all the examples that can be read from the file path. + + If `self.lazy` is False, this eagerly reads all instances from `self._read()` + and returns a `MapDataset`. + + If `self.lazy` is True, this returns an `IterDataset`, which internally + relies on the generator created from `self._read()` to lazily produce examples. + In this case your implementation of `_read()` must also be lazy + (that is, not load all examples into memory at once). + + Args: + filename (str): Path of data file to read, usually provided by `_get_data` + function. + split (str, optional): The split name of selected dataset. This only makes + a different when data files of different splits have different structures. + + Returns: + A `MapDataset|IterDataset`. + """ + + label_list = self.get_labels() + vocab_info = self.get_vocab() + + def _create_dict(labels): + # For multiple labels in the form of list. + if isinstance(labels[0], list) or isinstance(labels[0], tuple): + label_dict = [] + for sub_labels in labels: + sub_dict = {} + for i, label in enumerate(sub_labels): + sub_dict[label] = i + label_dict.append(sub_dict) + else: + label_dict = {} + for i, label in enumerate(labels): + label_dict[label] = i + return label_dict + + def _convert_label_to_id(labels, label_dict): + if isinstance(labels, list) or isinstance(labels, tuple): + for label_idx in range(len(labels)): + labels[label_idx] = label_dict[labels[label_idx]] + else: + labels = label_dict[labels] + return labels + + if self.lazy: + + def generate_examples(): + generator = ( + self._read(filename, split) + if self._read.__code__.co_argcount > 2 + else self._read(filename) + ) + for example in generator: + # We need to check if the example contains label column and confirm its name. + # For now we only allow `label` or `labels` to be the name of label column. + if "labels" in example.keys(): + label_col = "labels" + elif "label" in example.keys(): + label_col = "label" + else: + label_col = None + + # Convert class label to label ids. + if label_list is not None and example.get(label_col, None): + label_dict = _create_dict(label_list) + # For multiple labels in the form of list. + if isinstance(label_dict, list): + for idx, sub_dict in enumerate(label_dict): + example[label_col][idx] = _convert_label_to_id( + example[label_col][idx], sub_dict + ) + else: + example[label_col] = _convert_label_to_id( + example[label_col], label_dict + ) + + yield example + else: + yield example + + return IterDataset( + generate_examples(), label_list=label_list, vocab_info=vocab_info + ) + else: + examples = ( + self._read(filename, split) + if self._read.__code__.co_argcount > 2 + else self._read(filename) + ) + + # Then some validation. 
+ if not isinstance(examples, list): + examples = list(examples) + + if not examples: + raise ValueError( + "No instances were read from the given filepath {}. " + "Is the path correct?".format(filename) + ) + + # We need to check if the example contains label column and confirm its name. + # For now we only allow `label` or `labels` to be the name of label column. + if "labels" in examples[0].keys(): + label_col = "labels" + elif "label" in examples[0].keys(): + label_col = "label" + else: + label_col = None + + # Convert class label to label ids. + if label_list is not None and examples[0].get(label_col, None): + label_dict = _create_dict(label_list) + for idx in range(len(examples)): + # For multiple labels in the form of list. + if isinstance(label_dict, list): + for i, sub_dict in enumerate(label_dict): + examples[idx][label_col][i] = _convert_label_to_id( + examples[idx][label_col][i], sub_dict + ) + else: + examples[idx][label_col] = _convert_label_to_id( + examples[idx][label_col], label_dict + ) + + return MapDataset(examples, label_list=label_list, vocab_info=vocab_info) + + def _read(self, filename: str, *args): + """ + Reads examples from the given file_path and returns them as an + `Iterable` (which could be a list or a generator). + + This method must be implemented in self-defined `DatasetBuilder`. + """ + raise NotImplementedError + + def _get_data(self, mode: str): + """ + Downloads examples from the given URL and customized split + informations and returns a filepath. + + This method must be implemented in self-defined `DatasetBuilder`. + """ + raise NotImplementedError + + def get_labels(self): + """ + Returns list of class labels of the dataset if specified. + """ + return None + + def get_vocab(self): + """ + Returns vocab file path of the dataset if specified. + """ + return None + + +class SimpleBuilder(DatasetBuilder): + def __init__(self, lazy, read_func): + self._read = read_func + self.lazy = lazy + + def read(self, **kwargs): + if self.lazy: + + def generate_examples(): + generator = self._read(**kwargs) + for example in generator: + yield example + + return IterDataset(generate_examples) + else: + examples = self._read(**kwargs) + if hasattr(examples, "__len__") and hasattr(examples, "__getitem__"): + return MapDataset(examples) + else: + return MapDataset(list(examples)) diff --git a/paddlevlp/examples/blip2/__init__.py b/paddlevlp/examples/blip2/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/examples/blip2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/examples/blip2/run_predict.py b/paddlevlp/examples/blip2/run_predict.py new file mode 100644 index 00000000000000..6c90dd87df984e --- /dev/null +++ b/paddlevlp/examples/blip2/run_predict.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
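Before moving on to the BLIP-2 example scripts: a hedged sketch of the custom-reader path handled by `load_dataset` and `SimpleBuilder` in `dataset.py` above, where a plain function is passed instead of a builder name. The TSV file and its layout here are hypothetical.

```python
from paddlevlp.datasets import load_dataset

def read_pairs(data_files):
    # One "image_path<TAB>caption" pair per line; keys mirror CaptionDataset's output.
    with open(data_files, encoding="utf-8") as f:
        for line in f:
            image_path, caption = line.rstrip("\n").split("\t")
            yield {"image": image_path, "text_input": caption}

# `lazy` must be given in custom mode; only keyword arguments that match the
# reader's signature (here just `data_files`) are forwarded to it.
ds = load_dataset(read_pairs, data_files="my_captions.tsv", lazy=False)   # MapDataset
```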
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +import paddle +import requests +from paddlenlp.trainer import PdArgumentParser +from PIL import Image + +from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration +from paddlevlp.processors.blip_processing import Blip2Processor +from paddlevlp.utils.log import logger + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + input_image: str = field( + metadata={"help": "The name of input image."} + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + prompt: str = field( + default=None, metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="Salesforce/blip2-opt-2.7b", + metadata={"help": "Path to pretrained model or model identifier"}, + ) + pretrained_model_path: str = field( + default=None, + metadata={ + "help": "The path to pre-trained model that we will use for inference." + }, + ) + + +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments)) + model_args, data_args = parser.parse_args_into_dataclasses() + url = ( + data_args.input_image + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + prompt = data_args.prompt + processor = Blip2Processor.from_pretrained( + model_args.model_name_or_path + ) # "Salesforce/blip2-opt-2.7b" + inputs = processor( + images=image, + text=prompt, + return_tensors="pd", + return_attention_mask=True, + mode="test", + ) + model = Blip2ForConditionalGeneration.from_pretrained(model_args.model_name_or_path) + + # load checkpoint + if model_args.pretrained_model_path: + weight = paddle.load(model_args.pretrained_model_path) + model.set_state_dict(weight) + + model.eval() + model.to("gpu") # doctest: +IGNORE_RESULT + generated_ids, scores = model.generate(**inputs) + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[ + 0 + ].strip() + logger.info("Generate text: {}".format(generated_text)) + + +if __name__ == "__main__": + main() diff --git a/paddlevlp/examples/blip2/run_pretrain_stage2.py b/paddlevlp/examples/blip2/run_pretrain_stage2.py new file mode 100644 index 00000000000000..de5cf419326c8c --- /dev/null +++ b/paddlevlp/examples/blip2/run_pretrain_stage2.py @@ -0,0 +1,271 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
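The prediction script above is driven entirely by `PdArgumentParser`, so the dataclass field names double as command-line flags. A rough sketch of the equivalent programmatic invocation; passing an explicit argv list to `parse_args_into_dataclasses` is assumed to work as in its Hugging Face counterpart, and the URL and prompt are the ones quoted in the script's comments.

```python
# Roughly equivalent to:
#   python paddlevlp/examples/blip2/run_predict.py \
#       --input_image http://images.cocodataset.org/val2017/000000039769.jpg \
#       --prompt "Question: how many cats are there? Answer:"
from paddlenlp.trainer import PdArgumentParser

# ModelArguments / DataArguments are the dataclasses defined in run_predict.py above.
parser = PdArgumentParser((ModelArguments, DataArguments))
model_args, data_args = parser.parse_args_into_dataclasses(
    [
        "--input_image", "http://images.cocodataset.org/val2017/000000039769.jpg",
        "--prompt", "Question: how many cats are there? Answer:",
    ]
)
```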
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from dataclasses import dataclass, field + +import paddle +from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, + get_last_checkpoint) +from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config + +import paddlevlp +from paddlevlp.datasets import load_dataset +from paddlevlp.models.blip2.configuration import (Blip2Config, + Blip2QFormerConfig, + Blip2VisionConfig) +from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration +from paddlevlp.optimization import FilterParamsName +from paddlevlp.processors.blip_processing import Blip2Processor +from paddlevlp.trainer import Trainer +from paddlevlp.utils.log import logger + + +class BlipCollator: + """ + Data collator that will dynamically pad the inputs to the longest sequence in the batch. + + Args: + processor (`paddlevlp.processors.ProcessorMixin`): + The processor used for pre-process the data. + """ + + def __init__(self, processor): + self.processor = processor + + def __call__(self, data_list): + images = [sample["image"] for sample in data_list] + text = [sample["text_input"] for sample in data_list] + batch = self.processor( + images=images, + text=text, + max_length=32, + return_tensors="pd", + return_attention_mask=True, + mode="train", + ) + return batch + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: str = field( + default="coco_caption", + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="Salesforce/blip2-opt-2.7b", + metadata={"help": "Path to pretrained model or model identifier"}, + ) + + text_model_name_or_path: str = field( + default="facebook/opt-2.7b", + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + + +@dataclass +class PreTrainingArguments(TrainingArguments): + """ + Arguments pertaining to what training options we are going to use during pretraining. + """ + + pretrained_model_path: str = field( + default=None, + metadata={ + "help": "The path to pre-trained model that we will use for pretraining." 
+ }, + ) + weight_decay: float = field( + default=0.5, metadata={"help": "Weight decay if we apply some."} + ) + learning_rate: float = field( + default=1e-4, metadata={"help": "The initial learning rate."} + ) + num_train_epochs: float = field( + default=10.0, metadata={"help": "Total number of training epochs to perform."} + ) + warmup_start_lr: float = field( + default=1e-6, metadata={"help": "Initial learning rate of warm up."} + ) + eta_min: float = field( + default=1e-5, metadata={"help": "The minimum value of learning rate."} + ) + warmup_steps: int = field( + default=2000, metadata={"help": "Number of warmup steps."} + ) + lr_scheduler_name: str = field( + default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."} + ) + + +def create_scheduler(dataset_len, config): + lr_sched_func = getattr(paddlevlp.optimization, config.lr_scheduler_name) + lr_sched = lr_sched_func( + learning_rate=config.learning_rate, + epochs=config.num_train_epochs, + eta_min=config.eta_min, + warmup_steps=config.warmup_steps, + warmup_start_lr=config.warmup_start_lr, + step_each_epoch=dataset_len, + ) + return lr_sched + + +def create_optimizer_and_scheduler(model, dataset_len, config): + lr_sched = create_scheduler(dataset_len, config) + param_filter = FilterParamsName() + p_wd, p_non_wd = param_filter(model) + optimizer = paddle.optimizer.AdamW( + parameters=p_wd + p_non_wd, + learning_rate=lr_sched, + weight_decay=float(config.weight_decay), + beta1=config.adam_beta1, + beta2=config.adam_beta2, + apply_decay_param_fun=param_filter.apply_decay_param_fun, + ) + return optimizer, lr_sched + + +def get_text_config(text_model_name_or_path): + if "t5" in text_model_name_or_path: + text_config = T5Config.from_pretrained(text_model_name_or_path) + elif "opt" in text_model_name_or_path: + text_config = OPTConfig.from_pretrained(text_model_name_or_path) + else: + text_config = AutoConfig.from_pretrained(text_model_name_or_path) + return text_config + + +def create_model(config): + # blip2_config = Blip2ForConditionalGeneration(onfig.model_name_or_path) + vision_config = Blip2VisionConfig.from_pretrained(config.model_name_or_path) + qformer_config = Blip2QFormerConfig.from_pretrained(config.model_name_or_path) + text_config = get_text_config(config.text_model_name_or_path) + blip2_config = Blip2Config.from_vision_qformer_text_configs( + vision_config, qformer_config, text_config + ) + + model = Blip2ForConditionalGeneration(blip2_config) + return model + + +def load_pretrained_model(model, pretrained_model_path): + if pretrained_model_path is None: + return + + if not os.path.exists(pretrained_model_path): + ValueError( + "Cannot find pretrained model path: {}".format(pretrained_model_path) + ) + + state_dict = paddle.load(pretrained_model_path) + for key in model.state_dict().keys(): + if key in state_dict.keys(): + if state_dict[key].shape != model.state_dict()[key].shape: + del state_dict[key] + + model.set_state_dict(state_dict) + + +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + # Log model and data config + training_args.print_config(model_args, "Model") + training_args.print_config(data_args, "Data") + + paddle.set_device(training_args.device) + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + f"distributed training: 
{bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + ) + + # Detecting last checkpoint + last_checkpoint = None + if ( + os.path.isdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + # if last_checkpoint is None and len( + # os.listdir(training_args.output_dir)) > 1: + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome.") + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # create dataset + processor = Blip2Processor.from_pretrained(model_args.model_name_or_path) + blip_collator = BlipCollator(processor) + train_dataset = load_dataset(data_args.task_name, splits="train") + dataset_len = len(train_dataset) + + # create model + model = create_model(model_args) + load_pretrained_model(model, training_args.pretrained_model_path) + + # load model for debug + # weight = paddle.load('/paddle/wangguanzhong/blip-jinman/PaddleNLP/blip2/blip2_checkout_4_output.pdparams') + # model.set_state_dict(weight) + + # create optimizer + optimizer, lr_sched = create_optimizer_and_scheduler( + model, dataset_len, training_args + ) + + # create trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=blip_collator, + optimizers=(optimizer, lr_sched), + ) + + # Training + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if training_args.do_train: + trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() + trainer.save_state() + + +if __name__ == "__main__": + main() diff --git a/paddlevlp/models/__init__.py b/paddlevlp/models/__init__.py new file mode 100644 index 00000000000000..904dfbb7a6d3d2 --- /dev/null +++ b/paddlevlp/models/__init__.py @@ -0,0 +1,16 @@ +# copyright (c) 2023 paddlepaddle authors. all rights reserved. +# copyright 2023 the salesforce team authors and the huggingface team. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from .blip2.modeling import * diff --git a/paddlevlp/models/blip2/__init__.py b/paddlevlp/models/blip2/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/models/blip2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/models/blip2/configuration.py b/paddlevlp/models/blip2/configuration.py new file mode 100644 index 00000000000000..d05ade37c2dcb1 --- /dev/null +++ b/paddlevlp/models/blip2/configuration.py @@ -0,0 +1,400 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" BLIP-2 model configuration""" +import copy +import os +from typing import Union + +from paddlenlp.transformers import AutoConfig +from paddlenlp.transformers.auto.modeling import \ + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.opt.configuration import OPTConfig +from paddlenlp.transformers.t5.configuration import T5Config +from paddlenlp.utils.log import logger + +__all__ = [ + "Blip2VisionConfig", + "Blip2QFormerConfig", + "Blip2Config", +] + + +class Blip2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a + BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. 
layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import Blip2VisionConfig, Blip2VisionModel + >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2VisionConfig() + >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2VisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=1e-6, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["vision_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Blip2QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a + BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
+ Examples: + ```python + >>> from paddlenlp.transformers import Blip2QFormerConfig, Blip2QFormerModel + >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2QFormerConfig() + >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "blip_2_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + # get the qformer config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["qformer_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Blip2Config(PretrainedConfig): + r""" + [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is + used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. 
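To see which Q-Former layers actually cross-attend to the image features, the short sketch below (illustrative only, again assuming the `paddlevlp` import path from this diff) applies the same `layer_idx % cross_attention_frequency == 0` rule that `Blip2QFormerLayer` uses later in this file.

```python
# Illustrative only; mirrors the cross-attention placement rule used by Blip2QFormerLayer.
from paddlevlp.models.blip2.configuration import Blip2QFormerConfig

config = Blip2QFormerConfig()  # defaults: num_hidden_layers=12, cross_attention_frequency=2

cross_attention_layers = [
    layer_idx
    for layer_idx in range(config.num_hidden_layers)
    if layer_idx % config.cross_attention_frequency == 0
]
print(cross_attention_layers)  # [0, 2, 4, 6, 8, 10] -> every other layer gets a cross-attention block
```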
+ text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... Blip2VisionConfig, + ... Blip2QFormerConfig, + ... OPTConfig, + ... Blip2Config, + ... Blip2ForConditionalGeneration, + ... ) + >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2Config() + >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2ForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig + >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations + >>> vision_config = Blip2VisionConfig() + >>> qformer_config = Blip2QFormerConfig() + >>> text_config = OPTConfig() + >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "blip-2" + is_composition = True + + def __init__( + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info( + "vision_config is None. initializing the Blip2VisionConfig with default values." + ) + + if qformer_config is None: + qformer_config = {} + logger.info( + "qformer_config is None. Initializing the Blip2QFormerConfig with default values." + ) + + if text_config is None: + text_config = {} + logger.info( + "text_config is None. Initializing the text config with default values (`OPTConfig`)." + ) + self.vision_config = Blip2VisionConfig(**vision_config) + self.qformer_config = Blip2QFormerConfig(**qformer_config) + text_model_type = ( + text_config["model_type"] if "model_type" in text_config else "opt" + ) + # self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + if text_model_type == "t5": + self.text_config = T5Config(**text_config) + elif text_model_type == "opt": + self.text_config = OPTConfig(**text_config) + else: + self.text_config = AutoConfig(**text_config) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = ( + self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ) + # CONFIGURATION_MODEL_MAPPING = get_init_configurations() + # self.use_decoder_only_language_model = self.text_config.model_type in CONFIGURATION_MODEL_MAPPING + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + self.freeze_vit = True + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: Blip2VisionConfig, + qformer_config: Blip2QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model + configurations. 
+ Returns: + [`Blip2Config`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/paddlevlp/models/blip2/modeling.py b/paddlevlp/models/blip2/modeling.py new file mode 100644 index 00000000000000..7cda33b843ee53 --- /dev/null +++ b/paddlevlp/models/blip2/modeling.py @@ -0,0 +1,1925 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Paddle BLIP2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss +from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput) +from paddlenlp.transformers.model_utils import ( + PretrainedModel, apply_chunking_to_forward, + find_pruneable_heads_and_indices, prune_linear_layer) +from paddlenlp.transformers.opt.configuration import OPTConfig +from paddlenlp.transformers.opt.modeling import OPTForCausalLM +from paddlenlp.transformers.t5.configuration import T5Config +from paddlenlp.transformers.t5.modeling import T5ForConditionalGeneration +from paddlenlp.utils.initializer import normal_, ones_, zeros_ +from paddlenlp.utils.log import logger + +from .configuration import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig + +BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Salesforce/blip2-flan-t5-xl", + "Salesforce/blip2-opt-2.7b", +] + +__all__ = [ + "Blip2QFormerModel", + "Blip2Model", + "Blip2PretrainedModel", + "Blip2VisionModel", + "Blip2ForConditionalGeneration", +] + + +def Parameter(tensor): + return paddle.create_parameter( + tensor.shape, + dtype=tensor.dtype, + default_initializer=nn.initializer.Assign(tensor), + ) + + +@dataclass +class Blip2ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`Blip2ForConditionalGeneration`]. 
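As a quick usage sketch of the composite config defined above (illustrative only; it assumes `paddlevlp` is importable and that `OPTConfig()` builds from its paddlenlp defaults), the snippet below composes a `Blip2Config` with the `from_vision_qformer_text_configs` helper and round-trips it through `to_dict`.

```python
# Illustrative only; not part of the patch.
from paddlenlp.transformers.opt.configuration import OPTConfig

from paddlevlp.models.blip2.configuration import (
    Blip2Config,
    Blip2QFormerConfig,
    Blip2VisionConfig,
)

vision_config = Blip2VisionConfig()
qformer_config = Blip2QFormerConfig()
text_config = OPTConfig()  # decoder-only language-model branch, as in blip2-opt-2.7b

config = Blip2Config.from_vision_qformer_text_configs(
    vision_config, qformer_config, text_config
)

# __init__ ties the Q-Former's cross-attention width to the vision tower.
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size

as_dict = config.to_dict()
print(as_dict["model_type"])                      # "blip-2"
print("vision_config" in as_dict, "qformer_config" in as_dict, "text_config" in as_dict)
```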
+ Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from paddlenlp.transformers.blip.modeling.BlipVisionEmbeddings with Blip->Blip2 +class Blip2VisionEmbeddings(nn.Layer): + def __init__(self, config: Blip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter( + paddle.randn([1, 1, self.embed_dim], dtype=paddle.get_default_dtype()), + ) + + self.patch_embedding = nn.Conv2D( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter( + paddle.randn( + [1, self.num_positions, self.embed_dim], + dtype=paddle.get_default_dtype(), + ) + ) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding( + pixel_values + ) # shape = [*, width, grid, grid] + # print('DEBUG!!!!! pixel_values: ', np.abs(pixel_values.numpy()).mean()) + # print('DEBUG!!!!! patch_embedding weight: ', np.abs(self.patch_embedding.weight.numpy()).mean(), self.patch_embedding.weight.shape) + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast( + target_dtype + ) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[ + :, : embeddings.shape[1], : + ].cast(target_dtype) + # rint('DEBUG!!!!embeddings: ', np.abs(embeddings.numpy()).mean()) + return embeddings + + +class Blip2Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter( + paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype()) + ) + v_bias = Parameter( + paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype()) + ) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape( + [bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads] + ).transpose([2, 0, 3, 1, 4]) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose( + [0, 2, 1, 3] + ) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from paddlenlp.transformers.blip.modeling.BlipMLP +class Blip2MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.blip.modeling.BlipEncoderLayer with Blip->Blip2 +class Blip2EncoderLayer(nn.Layer): + def __init__(self, config: Blip2Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Blip2Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = Blip2MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + # print('DEBUG!! vit input: ', np.abs(hidden_states.numpy()).mean()) + + hidden_states = self.layer_norm1(hidden_states) + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + # print('DEBUG!! vit layer mlp: ', np.abs(hidden_states.numpy()).mean()) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Blip2PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
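The fused-QKV bookkeeping in `Blip2Attention.forward` above is mostly reshapes and transposes; the toy-sized sketch below (illustrative, plain Paddle only, no model weights) traces the same shape flow end to end.

```python
# Illustrative only; reproduces the shape bookkeeping of Blip2Attention.forward with toy sizes.
import paddle
import paddle.nn.functional as F

bsz, tgt_len, embed_dim, num_heads = 2, 257, 1408, 16
head_dim = embed_dim // num_heads  # 88

mixed_qkv = paddle.randn([bsz, tgt_len, 3 * embed_dim])  # stand-in for the output of the fused `qkv` Linear

# [bsz, tgt_len, 3, num_heads, head_dim] -> [3, bsz, num_heads, tgt_len, head_dim]
mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, num_heads, head_dim]).transpose([2, 0, 3, 1, 4])
query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

scale = head_dim**-0.5
attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) * scale
attention_probs = F.softmax(attention_scores, axis=-1)

context = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3])
context = context.reshape([bsz, tgt_len, embed_dim])  # ready for the output projection

print(query_states.shape)    # [2, 16, 257, 88]
print(attention_probs.shape) # [2, 16, 257, 257]
print(context.shape)         # [2, 257, 1408]
```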
+ """ + + config_class = Blip2Config + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + r"language_model.encoder.embed_tokens.weight", + r"language_model.decoder.embed_tokens.weight", + ] + _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] + _keep_in_fp32_modules = ["wo"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if ( + isinstance(module, nn.Conv2D) + or isinstance(module, nn.Embedding) + or isinstance(module, nn.Linear) + ): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, Blip2VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, Blip2Encoder): + module.gradient_checkpointing = value + + +class Blip2Encoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Blip2EncoderLayer`]. + Args: + config (`Blip2Config`): + The corresponding vision configuration for the `Blip2Encoder`. + """ + + def __init__(self, config: Blip2Config): + super().__init__() + self.config = config + self.layers = nn.LayerList( + [Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +class Blip2VisionModel(Blip2PretrainedModel): + main_input_name = "pixel_values" + config_class = Blip2VisionConfig + + def __init__(self, config: Blip2VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = Blip2VisionEmbeddings(config) + self.encoder = Blip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class Blip2QFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() 
+ self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.cast( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = paddle.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
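The relative-position branch above is easiest to see with a tiny sequence. The sketch below (illustrative only, not part of the patch) builds the signed-offset matrix and the `bhld,lrd->bhlr` einsum on toy shapes.

```python
# Illustrative only; toy-sized version of the relative_key scoring used above.
import paddle
import paddle.nn as nn

seq_length, num_heads, head_dim, max_position_embeddings = 4, 2, 8, 512

position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1])
position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1])
distance = position_ids_l - position_ids_r  # signed offsets in [-(seq_length-1), seq_length-1]

distance_embedding = nn.Embedding(2 * max_position_embeddings - 1, head_dim)
positional_embedding = distance_embedding(distance + max_position_embeddings - 1)  # [seq, seq, head_dim]

query_layer = paddle.randn([1, num_heads, seq_length, head_dim])
relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)

print(distance.numpy())                 # Toeplitz matrix of signed offsets
print(relative_position_scores.shape)   # [1, 2, 4, 4] -> added to the raw attention scores
```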
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from paddlenlp.transformers.bert.modeling.BertSelfOutput with Bert->Blip2QFormer +class Blip2QFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor + ) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.attention.num_attention_heads, + self.attention.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len( + heads + ) + self.attention.all_head_size = ( + self.attention.attention_head_size * self.attention.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +# Copied from paddlenlp.transformers.bert.modeling.BertIntermediate with Bert->Blip2QFormer +class Blip2QFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + 
self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.bert.modeling.BertOutput with Bert->Blip2QFormer +class Blip2QFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor + ) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError( + "encoder_hidden_states must be given for cross-attention layers" + ) + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def 
feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [ + Blip2QFormerLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length + ) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2QFormerModel(Blip2PretrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. 
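`apply_chunking_to_forward` (imported from paddlenlp earlier in this file) relies on the feed-forward being applied token-wise, so running it over sequence chunks and concatenating reproduces the full pass. The sketch below demonstrates that equivalence with a throwaway MLP rather than the patch's own layers; it is illustrative only.

```python
# Illustrative only; shows the equivalence that chunked feed-forward relies on.
import paddle
import paddle.nn as nn

hidden_size, intermediate_size, chunk_size = 16, 64, 8
feed_forward = nn.Sequential(
    nn.Linear(hidden_size, intermediate_size),
    nn.GELU(),
    nn.Linear(intermediate_size, hidden_size),
)

hidden_states = paddle.randn([2, 32, hidden_size])  # [batch, seq_len, hidden]

full = feed_forward(hidden_states)
chunked = paddle.concat(
    [feed_forward(chunk) for chunk in paddle.split(hidden_states, 32 // chunk_size, axis=1)],
    axis=1,
)
print(paddle.allclose(full, chunked).item())  # True (up to float tolerance)
```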
+ """ + + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask( + self, encoder_attention_mask: paddle.Tensor + ) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast( + dtype=self.config.dtype + ) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length + if past_key_values is not None + else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm( + query_embeds.cast(self.layernorm.weight.dtype) + ) + embedding_output = self.dropout(embedding_output) + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones( + ((batch_size, seq_length + past_key_values_length)) + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 
0, :] + # print('DEBUG!!!sequence_output', sequence_output.shape, np.abs(sequence_output.numpy()).mean()) + # print('DEBUG!!!pooled_output', pooled_output.shape, np.abs(pooled_output.numpy()).mean()) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class Blip2Model(Blip2PretrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros( + [1, config.num_query_tokens, config.qformer_config.hidden_size] + ) + ) + self.qformer = Blip2QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, config.text_config.hidden_size + ) + if config.use_decoder_only_language_model: + if isinstance(config.text_config, OPTConfig): + language_model = OPTForCausalLM(config.text_config) + else: + raise NotImplementedError + else: + if isinstance(config.text_config, T5Config): + language_model = T5ForConditionalGeneration(config.text_config) + else: + raise NotImplementedError + self.language_model = language_model + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. 
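The additive-mask convention used by `get_extended_attention_mask` and `invert_attention_mask` in the Q-Former forward above is worth seeing on concrete numbers; the illustrative snippet below reproduces it for a toy padding mask.

```python
# Illustrative only; mirrors the (1 - mask) * -10000 additive-mask trick used above.
import paddle
import paddle.nn.functional as F

# 1 = attend, 0 = padding, shape [batch_size, seq_length]
attention_mask = paddle.to_tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype="float32")

# Broadcastable to [batch_size, num_heads, query_len, key_len]
extended_attention_mask = attention_mask[:, None, None, :]
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

scores = paddle.randn([2, 1, 4, 4])  # raw attention scores for a single head
probs = F.softmax(scores + extended_attention_mask, axis=-1)

print(probs[0, 0, 0].numpy())  # last position gets ~0 probability
print(probs[1, 0, 0].numpy())  # last two positions get ~0 probability
```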
+ Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import AutoTokenizer, Blip2Model + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to(device) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if self.config.use_decoder_only_language_model: + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + + text_outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import AutoProcessor, Blip2Model + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if not return_dict: + last_hidden_state = vision_outputs[0] + pooled_output = vision_outputs[1] + else: + last_hidden_state = vision_outputs.last_hidden_state + pooled_output = vision_outputs.pooler_output + + if not return_dict: + return (last_hidden_state, pooled_output) + vision_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples:
+ ```python
+ >>> import paddle
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import Blip2Processor, Blip2Model
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor(images=image, return_tensors="pd")
+ >>> qformer_outputs = model.get_qformer_features(**inputs)
+ ```"""
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[0]
+
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+ image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+ query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ return query_outputs
+
+ def forward(
+ self,
+ pixel_values: paddle.Tensor,
+ input_ids: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ decoder_input_ids: Optional[paddle.Tensor] = None,
+ decoder_attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ labels: Optional[paddle.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]:
+ r"""
+ Returns:
+ Examples:
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import Blip2Processor, Blip2Model
+ >>> import paddle
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> prompt = "Question: how many cats are there?
Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pd") + >>> outputs = model(pixel_values=inputs["pixel_values"],input_ids=inputs["input_ids"]) + ```""" + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones( + language_model_inputs.shape[:-1], dtype="int64" + ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + + attention_mask = paddle.concat( + [language_model_attention_mask, attention_mask], axis=1 + ) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct( + shift_logits.reshape([-1, self.config.text_config.vocab_size]), + shift_labels.reshape([-1]), + ) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class Blip2ForConditionalGeneration(Blip2PretrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + + def __init__(self, config: Blip2Config): + super().__init__(config) + 
self.vision_model = Blip2VisionModel(config.vision_config) + # self.post_layernorm = nn.LayerNorm(config.vision_config.hidden_size, epsilon=config.vision_config.layer_norm_eps) + self.freeze_vit = config.freeze_vit + if self.freeze_vit: + # freeze vit except the post layer norm layer. + for name, param in self.vision_model.named_parameters(): + if "post_layernorm" not in name: + param.stop_gradient = True + self.vision_model.eval() + logger.info("freeze vision encoder") + self.query_tokens = Parameter( + paddle.zeros( + [1, config.num_query_tokens, config.qformer_config.hidden_size] + ) + ) + self.qformer = Blip2QFormerModel(config.qformer_config) + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, config.text_config.hidden_size + ) + if config.use_decoder_only_language_model: + # language_model = AutoModelForCausalLM.from_config(config.text_config) + if isinstance(config.text_config, OPTConfig): + language_model = OPTForCausalLM(config.text_config) + else: + raise NotImplementedError + else: + # language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + if isinstance(config.text_config, T5Config): + language_model = T5ForConditionalGeneration(config.text_config) + else: + raise NotImplementedError + self.language_model = language_model + for name, param in self.language_model.named_parameters(): + param.stop_gradient = True + self.pad_token_id = config.text_config.pad_token_id + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + Image captioning (without providing a text prompt): + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-flan-t5-xl" + ... ) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> generated_ids, scores = model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two cats laying on a couch + ``` + Visual question answering (prompt = question): + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-flan-t5-xl" + ... ) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> prompt = "Question: how many cats are there? 
Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two + ```""" + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + # print('DEBUG!! Blip2ForCond query_tokens: ', query_tokens.shape, np.abs(query_tokens.numpy()).mean()) + # print('DEBUG!! Blip2ForCond image_embeds: ', image_embeds.shape, np.abs(image_embeds.numpy()).mean()) + # print('DEBUG!! Blip2ForCond image_attention_mask: ', image_attention_mask.shape, np.abs(image_attention_mask.numpy()).mean()) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # step 3: use the language model, conditioned on the query outputs and the prompt + # print('DEBUG!!! Blip2ForCond query_output: ', query_output.shape, query_output) + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones( + language_model_inputs.shape[:-1], dtype="int64" + ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + + attention_mask = paddle.concat( + [language_model_attention_mask, attention_mask], axis=1 + ) + + targets = input_ids * ( + 1 - (input_ids == self.pad_token_id).astype(input_ids.dtype) + ) + (input_ids == self.pad_token_id).astype(input_ids.dtype) * (-100) + + empty_targets = paddle.ones( + language_model_attention_mask.shape, dtype="int64" + ).fill_(-100) + labels = paddle.concat([empty_targets, targets], axis=1) + labels.stop_gradient = True + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs + loss = None + + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="none") + shift_logits = shift_logits.reshape( + [-1, self.config.text_config.vocab_size] + ) + shift_labels = shift_labels.reshape([-1]) + loss = ( + loss_fct(shift_logits, shift_labels).sum() + / (shift_labels > 0).sum() + ) + else: 
+ outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + # print('DEBUG!!! Blip2ForCond loss: ', loss.shape, loss) + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + batch_size = pixel_values.shape[0] + image_embeds = self.vision_model( + pixel_values, return_dict=True + ).last_hidden_state + # print('DEBUG!!! image_embeds: ', image_embeds.shape, ' ', np.abs(image_embeds.numpy()).mean(), image_embeds) + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + language_model_inputs = self.language_projection(query_output) + language_attention_mask = paddle.ones( + language_model_inputs.shape[:-1], dtype="int64" + ) + if input_ids is None: + input_ids = paddle.to_tensor([[self.config.text_config.bos_token_id]]).tile( + [batch_size, 1] + ) + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + attention_mask = paddle.concat( + [language_attention_mask, attention_mask], axis=1 + ) + # concatenate query embeddings with prompt embeddings + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + # print('DEBUG!! input_ids: ', input_ids.shape,'', np.abs(input_ids.numpy()).mean(), input_ids) + # print('DEBUG!! inputs_embeds: ', inputs_embeds.shape,'', np.abs(inputs_embeds.numpy()).mean(), inputs_embeds) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + # print('DEBUG!! 
inputs_embeds concat: ', inputs_embeds.shape,'', np.abs(inputs_embeds.numpy()).mean(), inputs_embeds) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decode_strategy="beam_search", + length_penalty=1.0, + num_beams=5, + max_length=30, + **generate_kwargs, + ) + return outputs diff --git a/paddlevlp/optimization.py b/paddlevlp/optimization.py new file mode 100644 index 00000000000000..9715228f9f4a77 --- /dev/null +++ b/paddlevlp/optimization.py @@ -0,0 +1,106 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from paddle.optimizer.lr import LRScheduler + +from paddlevlp.utils.log import logger + +__all__ = [ + "CosineDecayWithWarmup", + "FilterParamsName", +] + + +class CosineDecayWithWarmup(LRScheduler): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + eta_min(float): Minimum learning rate. Default: 0.0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__( + self, + learning_rate, + epochs, + eta_min=0.0, + warmup_steps=0, + warmup_start_lr=0.0, + last_epoch=-1, + step_each_epoch=1, + **kwargs + ): + self.start_lr = learning_rate + self.T_max = epochs + self.eta_min = eta_min + self.last_epoch = last_epoch + self.warmup_steps = warmup_steps + self.warmup_start_lr = warmup_start_lr + self.last_lr = self.start_lr + self.cur_step = 0 + self.last_epoch = last_epoch + self.step_each_epoch = step_each_epoch + if self.warmup_steps > 0: + self.last_lr = self.warmup_start_lr + super().__init__(learning_rate=self.last_lr, last_epoch=self.last_epoch) + + def step(self): + self.cur_step += 1 + cur_step_in_epoch = (self.cur_step - 2) % self.step_each_epoch + if self.cur_step < self.warmup_steps and self.last_epoch == 0: + self.last_lr = self.warmup_start_lr + ( + self.start_lr - self.warmup_start_lr + ) * cur_step_in_epoch / max(self.warmup_steps, 1) + else: + self.last_lr = (self.start_lr - self.eta_min) * 0.5 * ( + 1.0 + math.cos(math.pi * self.last_epoch / self.T_max) + ) + self.eta_min + self.last_epoch += 1 + + def get_lr(self): + return self.last_lr + + +class FilterParamsName(object): + """ + FilterParamsName is a utility class to filter out some params from optimizer. 
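+
+ A minimal usage sketch (the model, learning-rate schedule and weight-decay value below
+ are illustrative assumptions, not defaults of this module):
+
+ ```python
+ import paddle
+
+ from paddlevlp.optimization import CosineDecayWithWarmup, FilterParamsName
+
+ model = paddle.nn.Linear(4, 4)  # stand-in for the real model
+ param_filter = FilterParamsName()
+ p_wd, p_non_wd = param_filter(model)
+ lr_scheduler = CosineDecayWithWarmup(
+     learning_rate=1e-4, epochs=10, warmup_steps=2000, step_each_epoch=1000
+ )
+ optimizer = paddle.optimizer.AdamW(
+     learning_rate=lr_scheduler,
+     parameters=p_wd + p_non_wd,
+     weight_decay=0.05,
+     apply_decay_param_fun=param_filter.apply_decay_param_fun,
+ )
+ ```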
+ """ + + def __init__(self): + self.p_non_wd_name = [] + + def __call__(self, model): + num_parameters = 0 + p_wd, p_non_wd = [], [] + for n, p in model.named_parameters(): + if p.stop_gradient: + continue # frozen weights + if p.ndim < 2 or "bias" in n or "norm" in n.lower(): + p_non_wd.append(p) + self.p_non_wd_name.append(n) + else: + p_wd.append(p) + num_parameters += p.numel() + logger.info("number of trainable parameters: %d" % num_parameters) + return p_wd, p_non_wd + + def apply_decay_param_fun(self, name): + return name not in self.p_non_wd_name diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py new file mode 100644 index 00000000000000..4738e3272555e6 --- /dev/null +++ b/paddlevlp/processors/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .blip_processing import * diff --git a/paddlevlp/processors/blip_processing.py b/paddlevlp/processors/blip_processing.py new file mode 100644 index 00000000000000..916be3b2493ed7 --- /dev/null +++ b/paddlevlp/processors/blip_processing.py @@ -0,0 +1,661 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for BLIP-2. +""" + +import re +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, + PreTokenizedInput, + TensorType, TextInput) + +from .base_processing import ProcessorMixin +from .image_transform_utils import (convert_to_rgb, normalize, + random_horizontal_flip, + random_resized_crop, rescale, resize, + to_channel_dimension_format) +from .image_utils import (IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, + ChannelDimension, ImageInput, PILImageResampling, + load_image, to_numpy_array, valid_images) +from .processing_utils import (BaseImageProcessor, BaseTextProcessor, + get_size_dict) + +__all__ = [ + "Blip2Processor", + "BlipImageProcessor", + "BlipTextProcessor", +] + + +class Blip2Processor(ProcessorMixin): + r""" + Constructs a BLIP-2 processor which wraps a BLIP2 image processor and an OPT/T5 tokenizer into a single processor. + [`Blip2Processor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring + of [`~Blip2Processor.__call__`] and [`~Blip2Processor.decode`] for more information. 
+ Args:
+ image_processor (`BlipImageProcessor`):
+ An instance of [`BlipImageProcessor`]. The image processor is a required input.
+ text_processor (`BlipTextProcessor`):
+ An instance of [`BlipTextProcessor`]. The text processor is a required input.
+ tokenizer (`AutoTokenizer`):
+ An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+ """
+ attributes = ["image_processor", "text_processor", "tokenizer"]
+ image_processor_class = "BlipImageProcessor"
+ text_processor_class = "BlipTextProcessor"
+ tokenizer_class = "AutoTokenizer"
+
+ def __init__(self, image_processor, text_processor, tokenizer):
+ super().__init__(image_processor, text_processor, tokenizer)
+
+ def __call__(
+ self,
+ images=None,
+ text: Union[
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+ ] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ max_length=32,
+ mode="train",
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+ (after `BlipTextProcessor` pre-processing) and `kwargs` arguments to the tokenizer's
+ [`~PreTrainedTokenizer.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this
+ method forwards the `images` and `kwargs` arguments to BlipImageProcessor's [`~BlipImageProcessor.__call__`]
+ if `images` is not `None`. Please refer to the docstring of the above two methods for more information.
+
+ Args:
+
+ images (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[paddle.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or Paddle
+ tensor. In case of a NumPy array/Paddle tensor, each image should be of shape (C, H, W), where C is a
+ number of channels, H and W are image height and width.
+ text (`str`, `List[str]`, `List[List[str]]`):
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors of a particular framework. Acceptable values are:
+ - `'pd'`: Return Paddle `paddle.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ max_length (`int`, *optional*):
+ If set to a number, will limit the total length of the returned sequences.
+ mode (`str`, *optional*):
+ The processing mode, one of "train", "val" or "test".
+
+ Returns:
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+ `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
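+
+ Example (a sketch only; it assumes the `Salesforce/blip2-flan-t5-xl` processor assets used
+ elsewhere in this repository are available):
+
+ ```python
+ >>> import requests
+ >>> from PIL import Image
+ >>> from paddlevlp.processors import Blip2Processor
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")  # doctest: +SKIP
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor(
+ ...     images=image, text="a photo of", return_tensors="pd", mode="test"
+ ... )  # doctest: +SKIP
+ ```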
+ """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + # Get only text + if images is None: + text_encoding = self.text_processor(text, mode=mode) + text_encoding = self.tokenizer( + text=text_encoding, + return_tensors=return_tensors, + return_token_type_ids=False, + max_length=32, + padding=True, + **kwargs, + ) + return text_encoding + + # add pixel_values + encoding_image_processor = self.image_processor( + images, return_tensors=return_tensors, mode=mode + ) + + if text is not None: + text_encoding = self.text_processor(text, mode=mode) + text_encoding = self.tokenizer( + text=text_encoding, + return_tensors=return_tensors, + return_token_type_ids=False, + max_length=max_length, + padding=True, + **kwargs, + ) + else: + text_encoding = None + # eos_token_id = None + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + + return encoding_image_processor + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class BlipTextProcessor(BaseTextProcessor): + r""" + Constructs a BLIP text processor. + + Args: + prompt(`str`, *optional*, defaults to `""`): + The prompt (used for generating prompts) that will be prepended to each generated text. + do_caption (`bool`, *optional*, defaults to `False`): + Whether to do the caption task. + do_question(`bool`, *optional*, defaults to `False`): + Whether to do the question task. + max_words (`int`, *optional*, defaults to `50`): + The maximum number of words to keep in the span of text. + + """ + + def __init__( + self, + prompt: str = "", + do_caption: bool = False, + do_question: bool = False, + max_words: int = 50, + **kwargs, + ): + super().__init__(**kwargs) + if do_question and do_caption: + raise ValueError( + "do_caption and do_question cannot be set at the same time." + ) + if not do_caption and not do_question: + raise ValueError("Either do_caption or do_question must be set to True.") + self.prompt = prompt + self.do_caption = do_caption + self.do_question = do_question + self.max_words = max_words + + def __call__( + self, + text, + do_caption: Optional[bool] = None, + do_question: Optional[bool] = None, + mode: str = "train", + **kwargs, + ): + """ + Preprocess the text before tokenization. + + Args: + text (`str`): + Text to preprocess. + do_caption (`bool`, *optional*, defaults to `False`): + Whether to do the caption task. + do_question(`bool`, *optional*, defaults to `False`): + Whether to do the question task. 
+ mode(`str`, *optional*, defaults to `train`): + The mode of ("train", "val", "test") + + """ + do_caption = do_caption if do_caption is not None else self.do_caption + do_question = do_question if do_question is not None else self.do_question + if do_caption and do_question: + raise ValueError( + "do_caption and do_question cannot be set at the same time." + ) + if not do_caption and not do_question: + raise ValueError("Either do_caption or do_question must be set to True.") + + if not isinstance(text, (list, tuple)): + text = [text] + # import pdb; pdb.set_trace() + if do_caption: + results = [self.prompt + self.pre_caption(t) for t in text] + if do_question: + results = [self.pre_question(t) for t in text] + if mode == "train": + results = [res + "\n" for res in results] + return results + + def pre_caption(self, caption: str) -> str: + """ + Preprocess the text before tokenization. + """ + caption = re.sub( + r"([.!\"()*#:;~])", + " ", + caption.lower(), + ) + caption = re.sub( + r"\s{2,}", + " ", + caption, + ) + caption = caption.rstrip("\n") + caption = caption.strip(" ") + + # truncate caption + caption_words = caption.split(" ") + if len(caption_words) > self.max_words: + caption = " ".join(caption_words[: self.max_words]) + + return caption + + def pre_question(self, question: str) -> str: + """ + Preprocess the text before tokenization. + """ + question = re.sub( + r"([.!\"()*#:;~])", + "", + question.lower(), + ) + question = question.rstrip(" ") + + # truncate question + question_words = question.split(" ") + if len(question_words) > self.max_words: + question = " ".join(question_words[: self.max_words]) + + return question + + +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_rand_resize_crop (`bool`, *optional*, defaults to `False`): + Whether to *randomly crop* the image at random in the height and width dimensions. + rand_resize_crop_prob (`float`, *optional*, defaults to `0.5`): + Probability of applying a random crop to the image. + scale (`list|tuple`, *optional*, defaults to `(0.08, 1.0)`): + Scale range of the cropped image before resizing, relatively to the origin image. + mode (`str`, *optional*): + The mode of ("train", "val", "test") + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + do_flip: bool = False, + flip_prob: float = 0.5, + do_rand_resize_crop: bool = False, + scale: Optional[Union[List[float], Tuple[float]]] = (0.08, 1.0), + do_collate: bool = False, + mode: str = "train", + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = ( + image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + ) + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_convert_rgb = do_convert_rgb + self.do_flip = do_flip + self.flip_prob = flip_prob + self.do_rand_resize_crop = do_rand_resize_crop + self.scale = scale + self.do_collate = do_collate + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
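+
+ For example, a minimal sketch (the input is assumed to be a channel-last NumPy array):
+
+ ```python
+ >>> import numpy as np
+ >>> processor = BlipImageProcessor()
+ >>> image = np.zeros((480, 640, 3), dtype=np.uint8)
+ >>> resized = processor.resize(image, size={"height": 384, "width": 384})
+ ```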
+ """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + **kwargs, + ) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def random_resized_crop( + self, + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> np.ndarray: + """ + Crop the input data to random size and aspect ratio. + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + + Args: + image (`np.ndarray`): + Image to resize to and crop. + size (Union[int, List, Tuple]): + Size of cropped image. + scale (`float`): + Scale to apply to the image. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + """ + size = list(size.values()) + return random_resized_crop( + image, size=size, scale=scale, resample=resample, **kwargs + ) + + def random_horizontal_flip( + self, image: np.ndarray, flip_prob: float, **kwargs + ) -> np.ndarray: + """ + Horizontally flip the input data randomly with a given probability. + + Args: + image (`np.ndarray`): + Image to flip. + flip_prob (`float`): + Probability of flipping the image. 
+ """ + return random_horizontal_flip(image, flip_prob=flip_prob, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + do_flip: bool = None, + flip_prob: float = None, + do_rand_resize_crop: bool = None, + scale: Optional[Union[List[float], Tuple[float]]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + mode: str = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + do_rand_resize_crop (`bool`, *optional*, defaults to `False`): + Whether to *randomly crop* the image at random in the height and width dimensions. + scale (`list|tuple`, *optional*, defaults to `(0.08, 1.0)`): + Scale range of the cropped image before resizing, relatively to the origin image. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ mode (`str`, *optional*): + The mode of ("train", "val", "test") + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = ( + rescale_factor if rescale_factor is not None else self.rescale_factor + ) + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = ( + do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + ) + do_flip = do_flip if do_flip is not None else self.do_flip + flip_prob = flip_prob if flip_prob is not None else self.flip_prob + scale = scale if scale is not None else self.scale + do_rand_resize_crop = ( + do_rand_resize_crop + if do_rand_resize_crop is not None + else self.do_rand_resize_crop + ) + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not isinstance(images, (list, tuple)): + images = [images] + + if isinstance(images[0], str): + images = [load_image(image) for image in images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "paddle.Tensor." + ) + + if do_resize and size is None or resample is None: + raise ValueError( + "Size and resample must be specified if do_resize is True." + ) + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError( + "Image mean and std must be specified if do_normalize is True." + ) + + if do_flip and flip_prob is None: + raise ValueError("Flip probability must be specified if do_flip is True.") + + if do_rand_resize_crop and scale is None: + raise ValueError( + "Random resize crop probability must be specified if do_rand_resize_crop is True." + ) + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + if do_rand_resize_crop and mode == "train": + images = [ + self.random_resized_crop( + image=image, size=size, scale=scale, resample=resample + ) + for image in images + ] + elif do_resize and mode != "train": + images = [ + self.resize(image=image, size=size, resample=resample) + for image in images + ] + + if do_flip and mode == "train": + images = [ + self.random_horizontal_flip(image=image, flip_prob=flip_prob) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor) for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std) + for image in images + ] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchEncoding(data=data, tensor_type=return_tensors) diff --git a/paddlevlp/processors/image_processing_utils.py b/paddlevlp/processors/image_processing_utils.py new file mode 100644 index 00000000000000..e476b8549c8f38 --- /dev/null +++ b/paddlevlp/processors/image_processing_utils.py @@ -0,0 +1,553 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import tempfile +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +import numpy as np +from huggingface_hub import (create_repo, get_hf_file_metadata, + hf_hub_download, hf_hub_url, + repo_type_and_id_from_hf_id, upload_folder) +from huggingface_hub.utils import EntryNotFoundError +from paddlenlp import __version__ +from paddlenlp.transformers.feature_extraction_utils import \ + BatchFeature as BaseBatchFeature + +from paddlevlp.utils.downloader import (COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir) +from paddlevlp.utils.log import logger + +IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json" +TEXT_PROCESSOR_NAME = "text_processor_config.json" + + +class BatchFeature(BaseBatchFeature): + r""" + Holds the output of the image processor specific `__call__` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in Paddle/Numpy Tensors at + initialization. + """ + + +class ImageProcessingMixin(object): + """ + This is an image processor mixin used to provide saving/loading functionality for sequential and image feature + extractors. + """ + + _auto_class = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ): + r""" + Instantiate a type of [`~processing_utils.ImageProcessingMixin`] from an image processor. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained image_processor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a image processor file saved using the + [`~processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved image processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. 
+ cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model image processor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final image processor object. If `True`, then this + functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of + `kwargs` which has not been used to update `image_processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + Returns: + A image processor of type [`~processing_utils.ImageProcessingMixin`]. + + Examples: + + ```python + # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a + # derived class: *CLIPImageProcessor* + image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32" + ) # Download image_processing_config from huggingface.co and cache. + image_processor = CLIPImageProcessor.from_pretrained( + "./test/saved_model/" + ) # E.g. 
image processor (or model) was saved using *save_pretrained('./test/saved_model/')* + image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json") + image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32", do_normalize=False, foo=False + ) + assert image_processor.do_normalize is False + image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True + ) + assert image_processor.do_normalize is False + assert unused_kwargs == {"foo": False} + ```""" + image_processor_dict, kwargs = cls.get_image_processor_dict( + pretrained_model_name_or_path, **kwargs + ) + + return cls.from_dict(image_processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~processing_utils.ImageProcessingMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the image processor JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_pretrained` + output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) + + self.to_json_file(output_image_processor_file) + logger.info(f"Image processor saved in {output_image_processor_file}") + + return [output_image_processor_file] + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this processor to a new HuggingFace Hub repository. + Args: + repo_id (str): Repository name for your processor in the Hub. + private (bool, optional): Whether theprocessor is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. + If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. 
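+
+ Example (a sketch; the repo id is a placeholder and the call requires a prior
+ `huggingface-cli login`):
+
+ ```python
+ >>> processor = BaseImageProcessor(do_resize=True)
+ >>> processor.save_to_hf_hub(
+ ...     repo_id="your-username/blip2-image-processor",
+ ...     commit_message="Upload image processor",
+ ... )  # doctest: +SKIP
+ ```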
+ """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata( + hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision) + ) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + @classmethod + def get_image_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + from_hf_hub (bool, optional): whether to load from Huggingface Hub + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + subfolder = kwargs.pop("subfolder", None) + cache_dir = resolve_cache_dir( + pretrained_model_name_or_path, from_hf_hub, cache_dir + ) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + resolved_image_processor_file = os.path.join( + pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME + ) + elif os.path.isfile(pretrained_model_name_or_path): + resolved_image_processor_file = pretrained_model_name_or_path + is_local = True + elif from_hf_hub: + image_processor_file = IMAGE_PROCESSOR_NAME + resolved_image_processor_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=image_processor_file, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + else: + # Assuming from community-contributed pretrained models + image_processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + IMAGE_PROCESSOR_NAME, + ] + ) + try: + # Load from local folder or from cache or download from model Hub and cache + resolved_image_processor_file = get_path_from_url_with_filelock( + image_processor_file, cache_dir + ) + except EnvironmentError: + # Raise any environment error raise by `cached_file`. 
It will have a helpful error message adapted to + # the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load" + " it from 'BOS', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a {IMAGE_PROCESSOR_NAME} file" + ) + + try: + # Load image_processor dict + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file." + ) + + if is_local: + logger.info(f"loading configuration file {resolved_image_processor_file}") + else: + logger.info( + f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}" + ) + + return image_processor_dict, kwargs + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters. + + Args: + image_processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the image processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~processing_utils.ImageProcessingMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the image processor object. + + Returns: + [`~processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those + parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + image_processor = cls(**image_processor_dict) + + # Update image_processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(image_processor, key): + setattr(image_processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Image processor {image_processor}") + if return_unused_kwargs: + return image_processor, kwargs + else: + return image_processor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. + """ + output = copy.deepcopy(self.__dict__) + output["image_processor_type"] = self.__class__.__name__ + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]): + """ + Instantiates a image processor of type [`~processing_utils.ImageProcessingMixin`] from the path to a JSON + file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + A image processor of type [`~processing_utils.ImageProcessingMixin`]: The image_processor object + instantiated from that JSON file. + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + return cls(**image_processor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. 
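+
+        Example (a minimal sketch):
+
+            config_str = image_processor.to_json_string()   # pretty-printed JSON, keys sorted
+            image_processor.to_json_file("preprocessor_config.json")  # writes the same representation to disk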
+ """ + dictionary = self.to_dict() + + for key, value in dictionary.items(): + if isinstance(value, np.ndarray): + dictionary[key] = value.tolist() + + # make sure private name "_processor_class" is correctly + # saved as "processor_class" + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this image_processor instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + +class BaseImageProcessor(ImageProcessingMixin): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, images, **kwargs) -> BatchFeature: + """Preprocess an image or a batch of images.""" + return self.preprocess(images, **kwargs) + + def preprocess(self, images, **kwargs) -> BatchFeature: + raise NotImplementedError( + "Each image processor must implement its own preprocess method" + ) + + +VALID_SIZE_DICT_KEYS = ( + {"height", "width"}, + {"shortest_edge"}, + {"shortest_edge", "longest_edge"}, +) + + +def is_valid_size_dict(size_dict): + if not isinstance(size_dict, dict): + return False + + size_dict_keys = set(size_dict.keys()) + for allowed_keys in VALID_SIZE_DICT_KEYS: + if size_dict_keys == allowed_keys: + return True + return False + + +def convert_to_size_dict( + size, + max_size: Optional[int] = None, + default_to_square: bool = True, + height_width_order: bool = True, +): + # By default, if size is an int we assume it represents a tuple of (size, size). + if isinstance(size, int) and default_to_square: + if max_size is not None: + raise ValueError( + "Cannot specify both size as an int, with default_to_square=True and max_size" + ) + return {"height": size, "width": size} + # In other configs, if size is an int and default_to_square is False, size represents the length of + # the shortest edge after resizing. + elif isinstance(size, int) and not default_to_square: + size_dict = {"shortest_edge": size} + if max_size is not None: + size_dict["longest_edge"] = max_size + return size_dict + # Otherwise, if size is a tuple it's either (height, width) or (width, height) + elif isinstance(size, (tuple, list)) and height_width_order: + return {"height": size[0], "width": size[1]} + elif isinstance(size, (tuple, list)) and not height_width_order: + return {"height": size[1], "width": size[0]} + + raise ValueError(f"Could not convert size input to size dict: {size}") + + +def get_size_dict( + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: + """ + Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards + compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, + width) or (width, height) format. + + - If `size` is tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width": + size[0]}` if `height_width_order` is `False`. 
+ - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`. + - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size` + is set, it is added to the dict as `{"longest_edge": max_size}`. + + Args: + size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): + The `size` parameter to be cast into a size dictionary. + max_size (`Optional[int]`, *optional*): + The `max_size` parameter to be cast into a size dictionary. + height_width_order (`bool`, *optional*, defaults to `True`): + If `size` is a tuple, whether it's in (height, width) or (width, height) order. + default_to_square (`bool`, *optional*, defaults to `True`): + If `size` is an int, whether to default to a square image or not. + """ + if not isinstance(size, dict): + size_dict = convert_to_size_dict( + size, max_size, default_to_square, height_width_order + ) + logger.info( + f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." + f" Converted to {size_dict}.", + ) + else: + size_dict = size + + if not is_valid_size_dict(size_dict): + raise ValueError( + f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}" + ) + return size_dict diff --git a/paddlevlp/processors/image_transform_utils.py b/paddlevlp/processors/image_transform_utils.py new file mode 100644 index 00000000000000..d5221d6707f930 --- /dev/null +++ b/paddlevlp/processors/image_transform_utils.py @@ -0,0 +1,795 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +import warnings +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np +import paddle +import PIL +from paddle.vision.transforms import functional as F +from PIL import Image + +from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, + TensorType, get_channel_dimension_axis, + get_image_size, infer_channel_dimension_format, + to_numpy_array) +from .utils import ExplicitEnum + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_channel_dimension_format( + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, +) -> np.ndarray: + """ + Converts `image` to the channel dimension format specified by `channel_dim`. + + Args: + image (`numpy.ndarray`): + The image to have its channel dimension set. + channel_dim (`ChannelDimension`): + The channel dimension format to use. + + Returns: + `np.ndarray`: The image with the channel dimension set to `channel_dim`. 
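+
+    Example (a minimal sketch with a dummy array):
+
+        img = np.zeros((224, 224, 3))                                   # channels last
+        chw = to_channel_dimension_format(img, ChannelDimension.FIRST)  # shape (3, 224, 224)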
+ """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if input_channel_dim is None: + input_channel_dim = infer_channel_dimension_format(image) + + target_channel_dim = ChannelDimension(channel_dim) + if input_channel_dim == target_channel_dim: + return image + + if target_channel_dim == ChannelDimension.FIRST: + image = image.transpose((2, 0, 1)) + elif target_channel_dim == ChannelDimension.LAST: + image = image.transpose((1, 2, 0)) + else: + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + + return image + + +def rescale( + image: np.ndarray, + scale: float, + data_format: Optional[ChannelDimension] = None, + dtype=np.float32, +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`float`): + The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. + + Returns: + `np.ndarray`: The rescaled image. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) + return rescaled_image + + +def to_pil_image( + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": + """ + Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if + needed. + + Args: + image (`PIL.Image.Image` or `numpy.ndarray` or `paddle.Tensor`): + The image to convert to the `PIL.Image` format. + do_rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default + to `True` if the image type is a floating type, `False` otherwise. + + Returns: + `PIL.Image.Image`: The converted image. + """ + if isinstance(image, PIL.Image.Image): + return image + + # Convert all tensors to numpy arrays before converting to PIL image + if is_paddle_tensor(image): + image = image.numpy() + elif not isinstance(image, np.ndarray): + raise ValueError("Input image type not supported: {}".format(type(image))) + + # If the channel as been moved to first dim, we put it back at the end. + image = to_channel_dimension_format(image, ChannelDimension.LAST) + + # If there is a single channel, we squeeze it, as otherwise PIL can't handle it. + image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image + + # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
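+    # e.g. a float array with values in [0, 1] is multiplied by 255 before the uint8 cast below,
+    # while an integer array is assumed to already be in the 0-255 range and is left as is.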
+ do_rescale = ( + isinstance(image.flat[0], (float, np.float32, np.float64)) + if do_rescale is None + else do_rescale + ) + if do_rescale: + image = rescale(image, 255) + image = image.astype(np.uint8) + return PIL.Image.fromarray(image) + + +# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + + Returns: + `tuple`: The target (height, width) dimension of the output image after resizing. + """ + if isinstance(size, (tuple, list)): + if len(size) == 2: + return tuple(size) + elif len(size) == 1: + # Perform same logic as if size was an int + size = size[0] + else: + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") + + if default_to_square: + return (size, size) + + height, width = get_image_size(input_image) + short, long = (width, height) if width <= height else (height, width) + requested_new_short = size + + new_short, new_long = requested_new_short, int(requested_new_short * long / short) + + if max_size is not None: + if max_size <= requested_new_short: + raise ValueError( + f"max_size = {max_size} must be strictly greater than the requested " + f"size for the smaller edge size = {size}" + ) + if new_long > max_size: + new_short, new_long = int(max_size * new_short / new_long), max_size + + return (new_long, new_short) if width <= height else (new_short, new_long) + + +def resize( + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: + """ + Resizes `image` to `(height, width)` specified by `size` using the PIL library. 
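+    For example, ``resize(img, size=(224, 224))`` returns an array of shape ``(224, 224, C)`` when
+    ``img`` is a channels-last numpy array (the inferred input format is preserved by default).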
+ + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to resize. + size (`Tuple[int, int]`): + The size to use for resizing the image. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + reducing_gap (`int`, *optional*): + Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to + the fair resampling. See corresponding Pillow documentation for more details. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + return_numpy (`bool`, *optional*, defaults to `True`): + Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is + returned. + + Returns: + `np.ndarray`: The resized image. + """ + resample = resample if resample is not None else PILImageResampling.BILINEAR + + if not len(size) == 2: + raise ValueError("size must have 2 elements") + + # For all transformations, we want to keep the same data format as the input image unless otherwise specified. + # The resized image from PIL will always have channels last, so find the input format first. + data_format = ( + infer_channel_dimension_format(image) if data_format is None else data_format + ) + + # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use + # the pillow library to resize the image and then convert back to numpy + if not isinstance(image, PIL.Image.Image): + image = to_pil_image(image) + height, width = size + # PIL images are in the format (width, height) + resized_image = image.resize( + (width, height), resample=resample, reducing_gap=reducing_gap + ) + + if return_numpy: + resized_image = np.array(resized_image) + # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image + # so we need to add it back if necessary. + resized_image = ( + np.expand_dims(resized_image, axis=-1) + if resized_image.ndim == 2 + else resized_image + ) + # The image is always in channels last format after converting from a PIL image + resized_image = to_channel_dimension_format( + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) + return resized_image + + +def normalize( + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: + """ + Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. + + image = (image - mean) / std + + Args: + image (`np.ndarray`): + The image to normalize. + mean (`float` or `Iterable[float]`): + The mean to use for normalization. + std (`float` or `Iterable[float]`): + The standard deviation to use for normalization. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - + # casting to numpy array and dividing by 255. 
+ image = to_numpy_array(image) + image = rescale(image, scale=1 / 255) + + if not isinstance(image, np.ndarray): + raise ValueError("image must be a numpy array") + + input_data_format = infer_channel_dimension_format(image) + channel_axis = get_channel_dimension_axis(image) + num_channels = image.shape[channel_axis] + + if isinstance(mean, Iterable): + if len(mean) != num_channels: + raise ValueError( + f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}" + ) + else: + mean = [mean] * num_channels + mean = np.array(mean, dtype=image.dtype) + + if isinstance(std, Iterable): + if len(std) != num_channels: + raise ValueError( + f"std must have {num_channels} elements if it is an iterable, got {len(std)}" + ) + else: + std = [std] * num_channels + std = np.array(std, dtype=image.dtype) + + if input_data_format == ChannelDimension.LAST: + image = (image - mean) / std + else: + image = ((image.T - mean) / std).T + + image = ( + to_channel_dimension_format(image, data_format) + if data_format is not None + else image + ) + return image + + +def center_crop( + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: + """ + Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to + the size given, it will be padded (so the returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + The image to crop. + size (`Tuple[int, int]`): + The target size for the cropped image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + return_numpy (`bool`, *optional*): + Whether or not to return the cropped image as a numpy array. Used for backwards compatibility with the + previous ImageFeatureExtractionMixin method. + - Unset: will return the same type as the input image. + - `True`: will return a numpy array. + - `False`: will return a `PIL.Image.Image` object. + Returns: + `np.ndarray`: The cropped image. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + image = to_numpy_array(image) + return_numpy = False if return_numpy is None else return_numpy + else: + return_numpy = True if return_numpy is None else return_numpy + + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if not isinstance(size, Iterable) or len(size) != 2: + raise ValueError( + "size must have 2 elements representing the height and width of the output image" + ) + + input_data_format = infer_channel_dimension_format(image) + output_data_format = data_format if data_format is not None else input_data_format + + # We perform the crop in (C, H, W) format and then convert to the output format + image = to_channel_dimension_format(image, ChannelDimension.FIRST) + + orig_height, orig_width = get_image_size(image) + crop_height, crop_width = size + crop_height, crop_width = int(crop_height), int(crop_width) + + # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result. 
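+    # e.g. orig_height=5, crop_height=3 gives top=1, bottom=4, i.e. rows 1..3 are kept.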
+ top = (orig_height - crop_height) // 2 + bottom = top + crop_height + # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result. + left = (orig_width - crop_width) // 2 + right = left + crop_width + + # Check if cropped area is within image boundaries + if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width: + image = image[..., top:bottom, left:right] + image = to_channel_dimension_format(image, output_data_format) + return image + + # Otherwise, we may need to pad if the image is too small. Oh joy... + new_height = max(crop_height, orig_height) + new_width = max(crop_width, orig_width) + new_shape = image.shape[:-2] + (new_height, new_width) + new_image = np.zeros_like(image, shape=new_shape) + + # If the image is too small, pad it with zeros + top_pad = (new_height - orig_height) // 2 + bottom_pad = top_pad + orig_height + left_pad = (new_width - orig_width) // 2 + right_pad = left_pad + orig_width + new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image + + top += top_pad + bottom += top_pad + left += left_pad + right += left_pad + + new_image = new_image[ + ..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right) + ] + new_image = to_channel_dimension_format(new_image, output_data_format) + + if not return_numpy: + new_image = to_pil_image(new_image) + + return new_image + + +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": + center_x, center_y, width, height = bboxes_center.unbind(-1) + bbox_corners = paddle.stack( + # top left x, top left y, bottom right x, bottom right y + [ + (center_x - 0.5 * width), + (center_y - 0.5 * height), + (center_x + 0.5 * width), + (center_y + 0.5 * height), + ], + axis=-1, + ) + return bbox_corners + + +def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: + center_x, center_y, width, height = bboxes_center.T + bboxes_corners = np.stack( + # top left x, top left y, bottom right x, bottom right y + [ + center_x - 0.5 * width, + center_y - 0.5 * height, + center_x + 0.5 * width, + center_y + 0.5 * height, + ], + axis=-1, + ) + return bboxes_corners + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(bboxes_center: TensorType) -> TensorType: + """ + Converts bounding boxes from center format to corners format. 
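+    For example, a box given as (center_x, center_y, width, height) = (2.0, 2.0, 2.0, 2.0) becomes
+    (top_left_x, top_left_y, bottom_right_x, bottom_right_y) = (1.0, 1.0, 3.0, 3.0).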
+ + center format: contains the coordinate for the center of the box and its width, height dimensions + (center_x, center_y, width, height) + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + """ + # Function is used during model forward pass, so we use the input framework if possible, without + # converting to numpy + if is_paddle_tensor(bboxes_center): + return _center_to_corners_format_paddle(bboxes_center) + elif isinstance(bboxes_center, np.ndarray): + return _center_to_corners_format_numpy(bboxes_center) + + raise ValueError(f"Unsupported input type {type(bboxes_center)}") + + +def _corners_to_center_format_paddle( + bboxes_corners: "paddle.Tensor", +) -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) + b = [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ] + return paddle.stack(b, axis=-1) + + +def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T + bboxes_center = np.stack( + [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ], + axis=-1, + ) + return bboxes_center + + +def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: + """ + Converts bounding boxes from corners format to center format. + + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + center format: contains the coordinate for the center of the box and its the width, height dimensions + (center_x, center_y, width, height) + """ + # Inverse function accepts different input types so implemented here too + if is_paddle_tensor(bboxes_corners): + return _corners_to_center_format_paddle(bboxes_corners) + elif isinstance(bboxes_corners, np.ndarray): + return _corners_to_center_format_numpy(bboxes_corners) + + raise ValueError(f"Unsupported input type {type(bboxes_corners)}") + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. + """ + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + """ + Converts unique ID to RGB color. + """ + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class PaddingMode(ExplicitEnum): + """ + Enum class for the different padding modes to use when padding images. 
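+
+    e.g. ``pad(image, padding=2, mode=PaddingMode.REFLECT)`` reflects the rows/columns nearest each
+    border (two on every side); note that ``PaddingMode.REPLICATE`` maps onto ``np.pad``'s "edge" mode.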
+ """ + + CONSTANT = "constant" + REFLECT = "reflect" + REPLICATE = "replicate" + SYMMETRIC = "symmetric" + + +def pad( + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> np.ndarray: + """ + Pads the `image` with the specified (height, width) `padding` and `mode`. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + def _expand_for_data_format(values): + """ + Convert values to be in the format expected by np.pad based on the data format. 
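+
+        e.g. for a channels-first image, ``2`` becomes ``((0, 0), (2, 2), (2, 2))`` and ``(1, 2)``
+        becomes ``((0, 0), (1, 2), (1, 2))``.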
+ """ + if isinstance(values, (int, float)): + values = ((values, values), (values, values)) + elif isinstance(values, tuple) and len(values) == 1: + values = ((values[0], values[0]), (values[0], values[0])) + elif ( + isinstance(values, tuple) + and len(values) == 2 + and isinstance(values[0], int) + ): + values = (values, values) + elif ( + isinstance(values, tuple) + and len(values) == 2 + and isinstance(values[0], tuple) + ): + values = values + else: + raise ValueError(f"Unsupported format: {values}") + + # add 0 for channel dimension + values = ( + ((0, 0), *values) + if input_data_format == ChannelDimension.FIRST + else (*values, (0, 0)) + ) + + # Add additional padding if there's a batch dimension + values = (0, *values) if image.ndim == 4 else values + return values + + padding = _expand_for_data_format(padding) + + if mode == PaddingMode.CONSTANT: + constant_values = _expand_for_data_format(constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + + image = ( + to_channel_dimension_format(image, data_format) + if data_format is not None + else image + ) + return image + + +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. + + Args: + image (Image): + The image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + + image = image.convert("RGB") + return image + + +def decode_image(image_path: str) -> ImageInput: + """ + Loads an image from a file. + + Args: + image path(str): Path to the image. + """ + image = Image.open(image_path) + return image + + +def random_horizontal_flip( + image: np.ndarray, + flip_prob: float, +) -> np.ndarray: + """ + Randomly flips the image horizontally. + + Args: + image (np.ndarray): Image to be flipped. + flip_prob (float): Probability that the image will be flipped. + """ + if random.random() < flip_prob: + return F.hflip(image) + return image + + +def get_crop_param(image, scale, ratio, attempts=10): + height, width = get_image_size(image) + area = height * width + np.random.seed(0) + random.seed(0) + for _ in range(attempts): + target_area = np.random.uniform(*scale) * area + log_ratio = tuple(math.log(x) for x in ratio) + aspect_ratio = math.exp(np.random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if 0 < w <= width and 0 < h <= height: + i = random.randint(0, height - h) + j = random.randint(0, width - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = float(width) / float(height) + if in_ratio < min(ratio): + w = width + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = height + w = int(round(h * max(ratio))) + else: + # return whole image + w = width + h = height + i = (height - h) // 2 + j = (width - w) // 2 + return i, j, h, w + + +def random_resized_crop( + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float = (0.08, 1.0), + ratio: float = (3.0 / 4, 4.0 / 3), + resample: "PILImageResampling" = None, +) -> np.ndarray: + """ + Crop the input data to random size and aspect ratio. 
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + + Args: + image (np.ndarray): Image to be cropped. + size (Union[int, List, Tuple]): Size of cropped image. + scale (float): Random scale factor. + aspect (float): Random aspect ratio. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + """ + i, j, h, w = get_crop_param(image, scale, ratio) + cropped_img = F.crop(image, i, j, h, w) + return resize(cropped_img, size, resample) diff --git a/paddlevlp/processors/image_utils.py b/paddlevlp/processors/image_utils.py new file mode 100644 index 00000000000000..7fb5f606914f4a --- /dev/null +++ b/paddlevlp/processors/image_utils.py @@ -0,0 +1,305 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from collections import UserDict +from typing import Dict, Iterable, List, Tuple, Union + +import numpy as np +import paddle +import PIL.Image +import PIL.ImageOps +import requests +from packaging import version + +from .utils import ExplicitEnum + +IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] +IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] +IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] +IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_numpy(obj): + """ + Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a Numpy array. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_numpy(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return np.array(obj) + elif is_paddle_tensor(obj): + return obj.detach().cpu().numpy() + else: + return obj + + +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): + PILImageResampling = PIL.Image.Resampling +else: + PILImageResampling = PIL.Image + + +ImageInput = Union[ + "PIL.Image.Image", + np.ndarray, + "paddle.Tensor", + List["PIL.Image.Image"], + List[np.ndarray], + List["paddle.Tensor"], +] # noqa + + +class TensorType(ExplicitEnum): + """ + Possible values for the `return_tensors` argument in [`PretrainedTokenizerBase.__call__`]. Useful for + tab-completion in an IDE. 
+ """ + + PADDLE = "pd" + NUMPY = "np" + + +class ChannelDimension(ExplicitEnum): + FIRST = "channels_first" + LAST = "channels_last" + + +def is_valid_image(img): + return ( + isinstance(img, PIL.Image.Image) + or isinstance(img, np.ndarray) + or is_paddle_tensor(img) + ) + + +def valid_images(imgs): + # If we have an list of images, make sure every image is valid + if isinstance(imgs, (list, tuple)): + for img in imgs: + if not valid_images(img): + return False + # If not a list of tuple, we have been given a single image or batched tensor of images + elif not is_valid_image(imgs): + return False + return True + + +def is_batched(img): + if isinstance(img, (list, tuple)): + return is_valid_image(img[0]) + return False + + +def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: + """ + Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1. + If the input is a batch of images, it is converted to a list of images. + Args: + images (`ImageInput`): + Image of images to turn into a list of images. + expected_ndims (`int`, *optional*, defaults to 3): + Expected number of dimensions for a single input image. If the input image has a different number of + dimensions, an error is raised. + """ + if is_batched(images): + return images + + # Either the input is a single image, in which case we create a list of length 1 + if isinstance(images, PIL.Image.Image): + # PIL images are never batched + return [images] + + if is_valid_image(images): + if images.ndim == expected_ndims + 1: + # Batch of images + images = list(images) + elif images.ndim == expected_ndims: + # Single image + images = [images] + else: + raise ValueError( + f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got" + f" {images.ndim} dimensions." + ) + return images + raise ValueError( + "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " + f"but got {type(images)}." + ) + + +def to_numpy_array(img) -> np.ndarray: + if not is_valid_image(img): + raise ValueError(f"Invalid image type: {type(img)}") + + if isinstance(img, PIL.Image.Image): + return np.array(img) + return to_numpy(img) + + +def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: + """ + Infers the channel dimension format of `image`. + + Args: + image (`np.ndarray`): + The image to infer the channel dimension of. + + Returns: + The channel dimension of the image. + """ + if image.ndim == 3: + first_dim, last_dim = 0, 2 + elif image.ndim == 4: + first_dim, last_dim = 1, 3 + else: + raise ValueError(f"Unsupported number of image dimensions: {image.ndim}") + + if image.shape[first_dim] in (1, 3): + return ChannelDimension.FIRST + elif image.shape[last_dim] in (1, 3): + return ChannelDimension.LAST + raise ValueError("Unable to infer channel dimension format") + + +def get_channel_dimension_axis(image: np.ndarray) -> int: + """ + Returns the channel dimension axis of the image. + + Args: + image (`np.ndarray`): + The image to get the channel dimension axis of. + + Returns: + The channel dimension axis of the image. 
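+
+    Example (a minimal sketch with dummy arrays):
+
+        get_channel_dimension_axis(np.zeros((3, 224, 224)))   # 0, channels first
+        get_channel_dimension_axis(np.zeros((224, 224, 3)))   # 2, channels last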
+ """ + channel_dim = infer_channel_dimension_format(image) + if channel_dim == ChannelDimension.FIRST: + return image.ndim - 3 + elif channel_dim == ChannelDimension.LAST: + return image.ndim - 1 + raise ValueError(f"Unsupported data format: {channel_dim}") + + +def get_image_size( + image: np.ndarray, channel_dim: ChannelDimension = None +) -> Tuple[int, int]: + """ + Returns the (height, width) dimensions of the image. + + Args: + image (`np.ndarray`): + The image to get the dimensions of. + channel_dim (`ChannelDimension`, *optional*): + Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image. + + Returns: + A tuple of the image's height and width. + """ + if channel_dim is None: + channel_dim = infer_channel_dimension_format(image) + + if channel_dim == ChannelDimension.FIRST: + return image.shape[-2], image.shape[-1] + elif channel_dim == ChannelDimension.LAST: + return image.shape[-3], image.shape[-2] + else: + raise ValueError(f"Unsupported data format: {channel_dim}") + + +def is_valid_annotation_coco_detection( + annotation: Dict[str, Union[List, Tuple]] +) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "annotations" in annotation + and isinstance(annotation["annotations"], (list, tuple)) + and ( + # an image can have no annotations + len(annotation["annotations"]) == 0 + or isinstance(annotation["annotations"][0], dict) + ) + ): + return True + return False + + +def is_valid_annotation_coco_panoptic( + annotation: Dict[str, Union[List, Tuple]] +) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "segments_info" in annotation + and "file_name" in annotation + and isinstance(annotation["segments_info"], (list, tuple)) + and ( + # an image can have no segments + len(annotation["segments_info"]) == 0 + or isinstance(annotation["segments_info"][0], dict) + ) + ): + return True + return False + + +def valid_coco_detection_annotations( + annotations: Iterable[Dict[str, Union[List, Tuple]]] +) -> bool: + return all(is_valid_annotation_coco_detection(ann) for ann in annotations) + + +def valid_coco_panoptic_annotations( + annotations: Iterable[Dict[str, Union[List, Tuple]]] +) -> bool: + return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations) + + +def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": + """ + Loads `image` to a PIL Image. + + Args: + image (`str` or `PIL.Image.Image`): + The image to convert to the PIL Image format. + + Returns: + `PIL.Image.Image`: A PIL Image. + """ + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + image = PIL.Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + image = PIL.Image.open(image) + else: + raise ValueError( + f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" + ) + elif isinstance(image, PIL.Image.Image): + image = image + else: + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 
+ ) + image = PIL.ImageOps.exif_transpose(image) + image = image.convert("RGB") + return image diff --git a/paddlevlp/processors/processing_utils.py b/paddlevlp/processors/processing_utils.py new file mode 100644 index 00000000000000..b1bd5072598189 --- /dev/null +++ b/paddlevlp/processors/processing_utils.py @@ -0,0 +1,538 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import tempfile +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +import numpy as np +from huggingface_hub import (create_repo, get_hf_file_metadata, + hf_hub_download, hf_hub_url, + repo_type_and_id_from_hf_id, upload_folder) +from huggingface_hub.utils import EntryNotFoundError +from paddlenlp import __version__ +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding + +from paddlevlp.utils.downloader import (COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir) +from paddlevlp.utils.log import logger + +PROCESSOR_CONFIG_MAPPING = { + "image": "image_preprocessor_config.json", + "text": "text_preprocessor_config.json", +} + + +class BaseProcessingMixin(object): + """ + This is an base processor mixin used to provide saving/loading functionality for sequential and feature + extractors. + """ + + _auto_class = None + input_type = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ): + r""" + Instantiate a type of [`~processing_utils.BaseProcessingMixin`] from an processor. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained processor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a processor file saved using the + [`~processing_utils.BaseProcessingMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model processor should be cached if the + standard cache should not be used. 
+ force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the processor files and override the cached versions if + they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final processor object. If `True`, then this + functions returns a `Tuple(processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not processor attributes: i.e., the part of + `kwargs` which has not been used to update `processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + Returns: + A processor of type [`~processing_utils.BaseProcessingMixin`]. + ```""" + processor_dict, kwargs = cls.get_processor_dict( + pretrained_model_name_or_path, **kwargs + ) + + return cls.from_dict(processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save an processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~processing_utils.BaseProcessingMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the processor JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_pretrained` + output_processor_file = os.path.join( + save_directory, PROCESSOR_CONFIG_MAPPING[self.input_type] + ) + + self.to_json_file(output_processor_file) + logger.info(f"processor saved in {output_processor_file}") + + return [output_processor_file] + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this processor to a new HuggingFace Hub repository. 
+ Args: + repo_id (str): Repository name for your processor in the Hub. + private (bool, optional): Whether theprocessor is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. + If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. + """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata( + hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision) + ) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + @classmethod + def get_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + processor of type [`~processor_utils.BaseProcessingMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + from_hf_hub (bool, optional): whether to load from Huggingface Hub + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object. 
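+
+        Example (a minimal sketch; ``MyImageProcessor`` and the checkpoint name are placeholders):
+
+            processor_dict, unused_kwargs = MyImageProcessor.get_processor_dict(
+                "org/model-name", image_mean=[0.5, 0.5, 0.5]
+            )
+            # kwargs that are not consumed by the lookup are passed through in ``unused_kwargs``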
+ """ + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + subfolder = kwargs.pop("subfolder", None) + cache_dir = resolve_cache_dir( + pretrained_model_name_or_path, from_hf_hub, cache_dir + ) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + resolved_processor_file = os.path.join( + pretrained_model_name_or_path, PROCESSOR_CONFIG_MAPPING[cls.input_type] + ) + elif os.path.isfile(pretrained_model_name_or_path): + resolved_processor_file = pretrained_model_name_or_path + is_local = True + elif from_hf_hub: + processor_file = PROCESSOR_CONFIG_MAPPING[cls.input_type] + resolved_processor_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=processor_file, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + else: + # Assuming from community-contributed pretrained models + processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + PROCESSOR_CONFIG_MAPPING[cls.input_type], + ] + ) + try: + # Load from local folder or from cache or download from model Hub and cache + resolved_processor_file = get_path_from_url_with_filelock( + processor_file, cache_dir + ) + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to + # the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load" + " it from 'BOS', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a {PROCESSOR_CONFIG_MAPPING[cls.input_type]} file" + ) + + try: + # Load processor dict + with open(resolved_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file." + ) + + if is_local: + logger.info(f"loading configuration file {resolved_processor_file}") + else: + logger.info( + f"loading configuration file {processor_file} from cache at {resolved_processor_file}" + ) + + return processor_dict, kwargs + + @classmethod + def from_dict(cls, processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~processing_utils.BaseProcessingMixin`] from a Python dictionary of parameters. + + Args: + processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~processing_utils.BaseProcessingMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the processor object. + + Returns: + [`~processing_utils.BaseProcessingMixin`]: The processor object instantiated from those + parameters. 
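+
+        Example (a minimal sketch; ``MyTextProcessor`` is a placeholder subclass):
+
+            config = processor.to_dict()
+            new_processor, unused = MyTextProcessor.from_dict(
+                config, prompt="a photo of", return_unused_kwargs=True
+            )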
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + processor = cls(**processor_dict) + + # Update processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(processor, key): + setattr(processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Processor {processor}") + if return_unused_kwargs: + return processor, kwargs + else: + return processor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. + """ + output = copy.deepcopy(self.__dict__) + output["processor_type"] = self.__class__.__name__ + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]): + """ + Instantiates a processor of type [`~processing_utils.BaseProcessingMixin`] from the path to a JSON + file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + A processor of type [`~processing_utils.BaseProcessingMixin`]: The processor object + instantiated from that JSON file. + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + processor_dict = json.loads(text) + return cls(**processor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. + """ + dictionary = self.to_dict() + + for key, value in dictionary.items(): + if isinstance(value, np.ndarray): + dictionary[key] = value.tolist() + + # make sure private name "_processor_class" is correctly + # saved as "processor_class" + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this processor instance's parameters will be saved. 
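+
+        Example (an illustrative sketch; `MyImageProcessor` is a hypothetical subclass of
+        `BaseImageProcessor` and the file name is a placeholder):
+
+        ```python
+        >>> processor = MyImageProcessor(do_flip=False)
+        >>> processor.to_json_file("my_processor_config.json")
+        >>> # the saved parameters can later be restored with `from_json_file`
+        >>> restored = MyImageProcessor.from_json_file("my_processor_config.json")
+        ```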
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+
+class BaseImageProcessor(BaseProcessingMixin):
+    input_type = "image"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, images, **kwargs) -> BatchEncoding:
+        """Preprocess an image or a batch of images."""
+        return self.preprocess(images, **kwargs)
+
+    def preprocess(self, images, **kwargs) -> BatchEncoding:
+        raise NotImplementedError(
+            "Each image processor must implement its own preprocess method"
+        )
+
+
+class BaseTextProcessor(BaseProcessingMixin):
+    input_type = "text"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, text, **kwargs) -> BatchEncoding:
+        """Preprocess text or a batch of texts."""
+        return self.preprocess(text, **kwargs)
+
+    def preprocess(self, text, **kwargs) -> BatchEncoding:
+        raise NotImplementedError(
+            "Each text processor must implement its own preprocess method"
+        )
+
+
+VALID_SIZE_DICT_KEYS = (
+    {"height", "width"},
+    {"shortest_edge"},
+    {"shortest_edge", "longest_edge"},
+)
+
+
+def is_valid_size_dict(size_dict):
+    if not isinstance(size_dict, dict):
+        return False
+
+    size_dict_keys = set(size_dict.keys())
+    for allowed_keys in VALID_SIZE_DICT_KEYS:
+        if size_dict_keys == allowed_keys:
+            return True
+    return False
+
+
+def convert_to_size_dict(
+    size,
+    max_size: Optional[int] = None,
+    default_to_square: bool = True,
+    height_width_order: bool = True,
+):
+    # By default, if size is an int we assume it represents a tuple of (size, size).
+    if isinstance(size, int) and default_to_square:
+        if max_size is not None:
+            raise ValueError(
+                "Cannot specify both size as an int with default_to_square=True and max_size"
+            )
+        return {"height": size, "width": size}
+    # In other configs, if size is an int and default_to_square is False, size represents the length of
+    # the shortest edge after resizing.
+    elif isinstance(size, int) and not default_to_square:
+        size_dict = {"shortest_edge": size}
+        if max_size is not None:
+            size_dict["longest_edge"] = max_size
+        return size_dict
+    # Otherwise, if size is a tuple it's either (height, width) or (width, height)
+    elif isinstance(size, (tuple, list)) and height_width_order:
+        return {"height": size[0], "width": size[1]}
+    elif isinstance(size, (tuple, list)) and not height_width_order:
+        return {"height": size[1], "width": size[0]}
+
+    raise ValueError(f"Could not convert size input to size dict: {size}")
+
+
+def get_size_dict(
+    size: Union[int, Iterable[int], Dict[str, int]] = None,
+    max_size: Optional[int] = None,
+    height_width_order: bool = True,
+    default_to_square: bool = True,
+    param_name="size",
+) -> dict:
+    """
+    Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards
+    compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height,
+    width) or (width, height) format.
+
+    - If `size` is a tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width":
+      size[0]}` if `height_width_order` is `False`.
+    - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`.
+    - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size`
+      is set, it is added to the dict as `{"longest_edge": max_size}`.
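+
+    For example (illustrative values):
+
+    ```python
+    >>> get_size_dict(224)
+    {'height': 224, 'width': 224}
+    >>> get_size_dict(224, max_size=256, default_to_square=False)
+    {'shortest_edge': 224, 'longest_edge': 256}
+    >>> get_size_dict((480, 640))
+    {'height': 480, 'width': 640}
+    ```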
+ + Args: + size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): + The `size` parameter to be cast into a size dictionary. + max_size (`Optional[int]`, *optional*): + The `max_size` parameter to be cast into a size dictionary. + height_width_order (`bool`, *optional*, defaults to `True`): + If `size` is a tuple, whether it's in (height, width) or (width, height) order. + default_to_square (`bool`, *optional*, defaults to `True`): + If `size` is an int, whether to default to a square image or not. + """ + if not isinstance(size, dict): + size_dict = convert_to_size_dict( + size, max_size, default_to_square, height_width_order + ) + logger.info( + f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." + f" Converted to {size_dict}.", + ) + else: + size_dict = size + + if not is_valid_size_dict(size_dict): + raise ValueError( + f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}" + ) + return size_dict diff --git a/paddlevlp/processors/utils.py b/paddlevlp/processors/utils.py new file mode 100644 index 00000000000000..34dd36fe33fea3 --- /dev/null +++ b/paddlevlp/processors/utils.py @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) diff --git a/paddlevlp/trainer/__init__.py b/paddlevlp/trainer/__init__.py new file mode 100644 index 00000000000000..bcdf4663fb4b70 --- /dev/null +++ b/paddlevlp/trainer/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .trainer import * diff --git a/paddlevlp/trainer/trainer.py b/paddlevlp/trainer/trainer.py new file mode 100644 index 00000000000000..ea566a6e5b12ef --- /dev/null +++ b/paddlevlp/trainer/trainer.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlenlp.trainer.trainer import Trainer diff --git a/paddlevlp/utils/__init__.py b/paddlevlp/utils/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/utils/downloader.py b/paddlevlp/utils/downloader.py new file mode 100644 index 00000000000000..3944b318ba7332 --- /dev/null +++ b/paddlevlp/utils/downloader.py @@ -0,0 +1,492 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import json +import os +import os.path as osp +import shutil +import tarfile +import threading +import time +import uuid +import zipfile +from typing import Optional, Union + +import requests +from filelock import FileLock +from huggingface_hub import get_hf_file_metadata, hf_hub_url +from huggingface_hub.utils import EntryNotFoundError +from tqdm.auto import tqdm + +from .env import (DOWNLOAD_SERVER, FAILED_STATUS, HF_CACHE_HOME, MODEL_HOME, + SUCCESS_STATUS) +from .log import logger + +__all__ = ["get_weights_path_from_url", "resolve_cache_dir"] + + +COMMUNITY_MODEL_PREFIX = os.getenv( + "COMMUNITY_MODEL_PREFIX", "https://bj.bcebos.com/paddlenlp/models/community" +) +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") +DOWNLOAD_RETRY_LIMIT = 3 +DOWNLOAD_CHECK = False + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith("http://") or path.startswith("https://") + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + Examples: + .. 
code-block:: python + from paddle.utils.download import get_weights_path_from_url + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): + """Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + fullpath = _download(url, root_dir, md5sum) + + if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath): + fullpath = _decompress(fullpath) + + # model tokenizer config, [file-lock] + return fullpath + + +def get_path_from_url_with_filelock( + url: str, + root_dir: str, + md5sum: Optional[str] = None, + check_exist: bool = True, + timeout: float = -1, +) -> str: + """construct `get_path_from_url` for `model_utils` to enable downloading multiprocess-safe + + Args: + url (str): the url of resource file + root_dir (str): the local download path + md5sum (str, optional): md5sum string for file. Defaults to None. + check_exist (bool, optional): whether check the file is exist. Defaults to True. + timeout (int, optional): the timeout for downloading. Defaults to -1. + + Returns: + str: the path of downloaded file + """ + + os.makedirs(root_dir, exist_ok=True) + + # create lock file, which is empty, under the `LOCK_FILE_HOME` directory. + lock_file_name = hashlib.md5((url + root_dir).encode("utf-8")).hexdigest() + + # create `.lock` private directory in the cache dir + lock_file_path = os.path.join(root_dir, ".lock", lock_file_name) + + os.makedirs(os.path.dirname(lock_file_path), exist_ok=True) + + with FileLock(lock_file_path, timeout=timeout): + result = get_path_from_url( + url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist + ) + return result + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + """ + os.makedirs(path, exist_ok=True) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError( + "Download from {} failed. 
" "Retry limit reached".format(url) + ) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError( + "Downloading from {} failed with code " + "{}!".format(url, req.status_code) + ) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get("content-length") + with open(tmp_fullname, "wb") as f: + if total_size: + with tqdm( + total=int(total_size), unit="B", unit_scale=True, unit_divisor=1024 + ) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(len(chunk)) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info( + "File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum) + ) + return False + return True + + +def _md5(text): + """ + Calculate the md5 value of the input text. + """ + + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+ + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, "r") + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + files.extractall(file_dir, files.getmembers()) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + files.extractall(file_dir, files.getmembers()) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + files.extractall(os.path.join(file_dir, rootpath), files.getmembers()) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if "/" in file_path: + file_path = file_path.replace("/", os.sep) + elif "\\" in file_path: + file_path = file_path.replace("\\", os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True + + +class DownloaderCheck(threading.Thread): + """ + Check the resource applicability when downloading the models. 
+ """ + + def __init__(self, task, command="taskflow", addition=None): + threading.Thread.__init__(self) + self.command = command + self.task = task + self.addition = addition + self._initialize() + + def uri_path(self, server_url, api): + srv = server_url + if server_url.endswith("/"): + srv = server_url[:-1] + if api.startswith("/"): + srv += api + else: + api = "/" + api + srv += api + return srv + + def _initialize(self): + etime = str(int(time.time())) + self.full_hash_flag = _md5(str(uuid.uuid1())[-12:]) + self.hash_flag = _md5(str(uuid.uuid1())[9:18]) + "-" + etime + + def request_check(self, task, command, addition): + if task is None: + return SUCCESS_STATUS + payload = {"word": self.task} + api_url = self.uri_path(DOWNLOAD_SERVER, "stat") + cache_path = os.path.join("~") + if os.path.exists(cache_path): + extra = { + "command": self.command, + "mtime": os.stat(cache_path).st_mtime, + "hub_name": self.hash_flag, + "cache_info": self.full_hash_flag, + } + else: + extra = { + "command": self.command, + "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), + "hub_name": self.hash_flag, + "cache_info": self.full_hash_flag, + } + if addition is not None: + extra.update({"addition": addition}) + try: + import paddle + import paddlenlp + + payload["hub_version"] = " " + payload["ppnlp_version"] = paddlenlp.__version__ + payload["paddle_version"] = paddle.__version__.split("-")[0] + payload["from"] = "ppnlp" + payload["extra"] = json.dumps(extra) + r = requests.get(api_url, payload, timeout=1).json() + if r.get("update_cache", 0) == 1: + return SUCCESS_STATUS + else: + return FAILED_STATUS + except Exception: + return FAILED_STATUS + + def run(self): + self.request_check(self.task, self.command, self.addition) + + +def download_check(model_id, model_class, addition=None): + logger.disable() + global DOWNLOAD_CHECK + if not DOWNLOAD_CHECK: + DOWNLOAD_CHECK = True + checker = DownloaderCheck(model_id, model_class, addition) + checker.start() + checker.join() + logger.enable() + + +def url_file_exists(url: str) -> bool: + """check whether the url file exists + + refer to: https://stackoverflow.com/questions/2486145/python-check-if-url-to-jpg-exists + + Args: + url (str): the url of target file + + Returns: + bool: whether the url file exists + """ + if not is_url(url): + return False + + result = requests.head(url) + return result.status_code == requests.codes.ok + + +def hf_file_exists( + repo_id: str, + filename: str, + token: Union[bool, str, None] = None, + subfolder: Optional[str] = None, +) -> bool: + """Check whether the HF file exists + + Args: + repo_id (`str`): A namespace (user or an organization) name and a repo name separated by a `/`. + filename (`str`): The name of the file in the repo. + token (`str` or `bool`, *optional*): A token to be used for the download. + - If `True`, the token is read from the HuggingFace config folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + subfolder (str, optional) An optional value corresponding to a folder inside the repo. 
+ Returns: + bool: whether the HF file exists + """ + + url = hf_hub_url(repo_id=repo_id, filename=filename, subfolder=subfolder) + try: + _ = get_hf_file_metadata( + url=url, + token=token, + ) + return True + except EntryNotFoundError: + return False + + +def resolve_cache_dir( + pretrained_model_name_or_path: str, + from_hf_hub: bool, + cache_dir: Optional[str] = None, +) -> str: + """resolve cache dir for PretrainedModel and PretrainedConfig + + Args: + pretrained_model_name_or_path (str): the name or path of pretrained model + from_hf_hub (bool): if load from huggingface hub + cache_dir (str): cache_dir for models + """ + if os.path.isdir(pretrained_model_name_or_path): + return pretrained_model_name_or_path + + # hf hub library takes care of appending the model name so we don't append the model name + if from_hf_hub: + if cache_dir is not None: + return cache_dir + else: + return HF_CACHE_HOME + else: + if cache_dir is not None: + # since model_clas.from_pretrained calls config_clas.from_pretrained, the model_name may get appended twice + if cache_dir.endswith(pretrained_model_name_or_path): + return cache_dir + else: + return os.path.join(cache_dir, pretrained_model_name_or_path) + return os.path.join(MODEL_HOME, pretrained_model_name_or_path) diff --git a/paddlevlp/utils/env.py b/paddlevlp/utils/env.py new file mode 100644 index 00000000000000..e2ecda491afbe3 --- /dev/null +++ b/paddlevlp/utils/env.py @@ -0,0 +1,84 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module is used to store environmental variables in PaddleMIX. +PPMIX_HOME --> the root directory for storing PaddleMIX related data. Default to ~/.paddlemix. Users can change the +├ default value through the PPMIX_HOME environment variable. +├─ MODEL_HOME --> Store model files. +└─ DATA_HOME --> Store automatically downloaded datasets. 
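+
+For example (an illustrative sketch; the path is hypothetical, and PPMIX_HOME must be set
+before this module is first imported):
+
+    >>> import os
+    >>> os.environ["PPMIX_HOME"] = "/data/paddlemix_cache"
+    >>> from paddlevlp.utils.env import DATA_HOME, MODEL_HOME
+    >>> MODEL_HOME
+    '/data/paddlemix_cache/models'
+    >>> DATA_HOME
+    '/data/paddlemix_cache/datasets'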
+""" +import os + + +def _get_user_home(): + return os.path.expanduser("~") + + +def _get_ppmix_home(): + if "PPMIX_HOME" in os.environ: + home_path = os.environ["PPMIX_HOME"] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + "The environment variable PPMIX_HOME {} is not a directory.".format( + home_path + ) + ) + else: + return home_path + return os.path.join(_get_user_home(), ".paddlemix") + + +def _get_sub_home(directory, parent_home=_get_ppmix_home()): + home = os.path.join(parent_home, directory) + if not os.path.exists(home): + os.makedirs(home, exist_ok=True) + return home + + +def _get_bool_env(env_key: str, default_value: str) -> bool: + """get boolean environment variable, which can be "true", "True", "1" + + Args: + env_key (str): key of env variable + """ + value = os.getenv(env_key, default_value).lower() + return value in ["true", "1"] + + +USER_HOME = _get_user_home() +PPMIX_HOME = _get_ppmix_home() +MODEL_HOME = _get_sub_home("models") +HF_CACHE_HOME = os.environ.get("HUGGINGFACE_HUB_CACHE", MODEL_HOME) +DATA_HOME = _get_sub_home("datasets") +PACKAGE_HOME = _get_sub_home("packages") +DOWNLOAD_SERVER = "http://paddlepaddle.org.cn/paddlehub" +FAILED_STATUS = -1 +SUCCESS_STATUS = 0 + +LEGACY_CONFIG_NAME = "model_config.json" +CONFIG_NAME = "config.json" +TOKENIZER_CONFIG_NAME = "tokenizer_config.json" +PYTORCH_WEIGHT_FILE_NAME = "pytorch_model.bin" +PADDLE_WEIGHT_FILE_NAME = "model_state.pdparams" +LORA_CONFIG_NAME = "lora_config.json" +PREFIX_CONFIG_NAME = "prefix_config.json" +LORA_WEIGHT_FILE_NAME = "lora_model_state.pdparams" +PREFIX_WEIGHT_FILE_NAME = "prefix_model_state.pdparams" +PAST_KEY_VALUES_FILE_NAME = "pre_caches.npy" + +# for conversion +ENABLE_TORCH_CHECKPOINT = _get_bool_env("ENABLE_TORCH_CHECKPOINT", "true") diff --git a/paddlevlp/utils/log.py b/paddlevlp/utils/log.py new file mode 100644 index 00000000000000..78d2d824b99a14 --- /dev/null +++ b/paddlevlp/utils/log.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
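+
+# A brief usage sketch for the `logger` singleton defined at the bottom of this file
+# (illustrative only; `load_weights` is a placeholder for any slow call):
+#
+#     from paddlevlp.utils.log import logger
+#
+#     logger.set_level("INFO")
+#     logger.info("loading pretrained weights")
+#     logger.train("epoch 1, step 10, loss 2.31")  # TRAIN is a custom level registered below
+#
+#     with logger.processing("Converting weights"):  # prints a rotating spinner while running
+#         load_weights()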
+ +import contextlib +import functools +import logging +import threading +import time + +import colorlog + +loggers = {} + +log_config = { + "DEBUG": {"level": 10, "color": "purple"}, + "INFO": {"level": 20, "color": "green"}, + "TRAIN": {"level": 21, "color": "cyan"}, + "EVAL": {"level": 22, "color": "blue"}, + "WARNING": {"level": 30, "color": "yellow"}, + "ERROR": {"level": 40, "color": "red"}, + "CRITICAL": {"level": 50, "color": "bold_red"}, +} + + +class Logger(object): + """ + Deafult logger in PaddleNLP + + Args: + name(str) : Logger name, default is 'PaddleNLP' + """ + + def __init__(self, name: str = None): + name = "PaddleNLP" if not name else name + self.logger = logging.getLogger(name) + + for key, conf in log_config.items(): + logging.addLevelName(conf["level"], key) + self.__dict__[key] = functools.partial(self.__call__, conf["level"]) + self.__dict__[key.lower()] = functools.partial(self.__call__, conf["level"]) + + self.format = colorlog.ColoredFormatter( + "%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s", + log_colors={key: conf["color"] for key, conf in log_config.items()}, + ) + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logLevel = "DEBUG" + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + self._is_enable = True + + def disable(self): + self._is_enable = False + + def enable(self): + self._is_enable = True + + def set_level(self, log_level: str): + assert ( + log_level in log_config + ), f"Invalid log level. Choose among {log_config.keys()}" + self.logger.setLevel(log_level) + + @property + def is_enable(self) -> bool: + return self._is_enable + + def __call__(self, log_level: str, msg: str): + if not self.is_enable: + return + + self.logger.log(log_level, msg) + + @contextlib.contextmanager + def use_terminator(self, terminator: str): + old_terminator = self.handler.terminator + self.handler.terminator = terminator + yield + self.handler.terminator = old_terminator + + @contextlib.contextmanager + def processing(self, msg: str, interval: float = 0.1): + """ + Continuously print a progress bar with rotating special effects. + + Args: + msg(str): Message to be printed. + interval(float): Rotation interval. Default to 0.1. + """ + end = False + + def _printer(): + index = 0 + flags = ["\\", "|", "/", "-"] + while not end: + flag = flags[index % len(flags)] + with self.use_terminator("\r"): + self.info("{}: {}".format(msg, flag)) + time.sleep(interval) + index += 1 + + t = threading.Thread(target=_printer) + t.start() + yield + end = True + + +logger = Logger() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000000000..789aec7d1ae15e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy +paddlenlp \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000000000..ceef21be8dcbb2 --- /dev/null +++ b/setup.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from setuptools import find_packages, setup + +description = "PaddleMIX" + +with open("requirements.txt") as fin: + REQUIRED_PACKAGES = fin.read() + + +def read(file: str): + current_dir = os.path.dirname(__file__) + path = os.path.join(current_dir, file) + with open(path, "r", encoding="utf-8") as f: + content = f.read().strip() + return content + + +def read_version(): + """read version of paddlemix""" + return read("VERSION") + + +def read_readme(): + return read("README.md") + + +def read_requirements(): + content = read("requirements.txt") + packages = content.split("\n") + return packages + + +setup( + name="paddlemix", + packages=find_packages(), + version=read_version(), + author="PaddleMIX Team", + author_email="paddlemix@baidu.com", + description=description, + long_description=read_readme(), + long_description_content_type="text/markdown", + url="", + keywords=["paddle", "paddlemix"], + install_requires=REQUIRED_PACKAGES, + python_requires=">=3.6", + entry_points={ + "console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"] + }, + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + license="Apache 2.0", +) From 46176c431fa61a3f23e24c22b8e17cb38922a41e Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Thu, 29 Jun 2023 05:37:37 +0000 Subject: [PATCH 03/10] add MiniGPT4 --- paddlevlp/models/minigpt4/__init__.py | 13 + paddlevlp/models/minigpt4/configuration.py | 348 ++++ paddlevlp/models/minigpt4/modeling.py | 1775 +++++++++++++++++ paddlevlp/processors/__init__.py | 2 + .../processors/minigpt4_image_processing.py | 284 +++ paddlevlp/processors/minigpt4_processing.py | 245 +++ 6 files changed, 2667 insertions(+) create mode 100644 paddlevlp/models/minigpt4/__init__.py create mode 100644 paddlevlp/models/minigpt4/configuration.py create mode 100644 paddlevlp/models/minigpt4/modeling.py create mode 100644 paddlevlp/processors/minigpt4_image_processing.py create mode 100644 paddlevlp/processors/minigpt4_processing.py diff --git a/paddlevlp/models/minigpt4/__init__.py b/paddlevlp/models/minigpt4/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/models/minigpt4/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/models/minigpt4/configuration.py b/paddlevlp/models/minigpt4/configuration.py new file mode 100644 index 00000000000000..4f9a5ec08b782f --- /dev/null +++ b/paddlevlp/models/minigpt4/configuration.py @@ -0,0 +1,348 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" MiniGPT4 model configuration """ +import copy +import os +from typing import Union + +from paddlenlp.utils.log import logger +from paddlenlp.transformers.auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.llama.configuration import LlamaConfig + +__all__ = ["MiniGPT4VisionConfig", "MiniGPT4QFormerConfig", "MiniGPT4Config"] + + +class MiniGPT4VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniGPT4VisionModel`]. It is used to instantiate a + MiniGPT4 vision encoder according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import MiniGPT4VisionConfig, MiniGPT4VisionModel + >>> # Initializing a MiniGPT4VisionConfig + >>> configuration = MiniGPT4VisionConfig() + >>> # Initializing a MiniGPT4VisionModel (with random weights) from the configuration above. 
+ >>> model = MiniGPT4VisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mimigpt4_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + # get the vision config dict if we are loading from MiniGPT4Config + if config_dict.get("model_type") == "minigpt4": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MiniGPT4QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniGPT4QFormerModel`]. It is used to instantiate a + MiniGPT4 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`MiniGPT4QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + Examples: + ```python + >>> from paddlenlp.transformers import MiniGPT4QFormerConfig, MiniGPT4QFormerModel + >>> # Initializing a MiniGPT4 configuration + >>> configuration = MiniGPT4QFormerConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = MiniGPT4QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "minigpt4_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = 
cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from MiniGPT4Config + if config_dict.get("model_type") == "minigpt4": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MiniGPT4Config(PretrainedConfig): + r""" + [`MiniGPT4Config`] is the configuration class to store the configuration of a [`MiniGPT4ForConditionalGeneration`]. It is + used to instantiate a MiniGPT4 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MiniGPT4VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MiniGPT4QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... MiniGPT4VisionConfig, + ... MiniGPT4QFormerConfig, + ... LlamaConfig, + ... MiniGPT4Config, + ... MiniGPT4ForConditionalGeneration, + ... ) + >>> # Initializing a MiniGPT4Config configuration + >>> configuration = MiniGPT4Config() + >>> # Initializing a MiniGPT4ForConditionalGeneration (with random weights) from the configuration above + >>> model = MiniGPT4ForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a MiniGPT4Config from a MiniGPT4VisionConfig, MiniGPT4QFormerConfig and any PretrainedConfig + >>> # Initializing MiniGPT4 vision, MiniGPT4 Q-Former and language model configurations + >>> vision_config = MiniGPT4VisionConfig() + >>> qformer_config = MiniGPT4QFormerConfig() + >>> text_config = LlamaConfig() + >>> config = MiniGPT4Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "minigpt4" + is_composition = True + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the MiniGPT4VisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`LlamaConfig`).") + self.vision_config = MiniGPT4VisionConfig(**vision_config) + self.qformer_config = MiniGPT4QFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "llama" + + if text_model_type == "llama": + self.text_config = LlamaConfig(**text_config) + else: + raise ValueError("Only llama accepted for model_type, but accepted {}.".format(text_model_type)) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: MiniGPT4VisionConfig, + qformer_config: MiniGPT4QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`MiniGPT4Config`] (or a derived class) from a vision model, Q-Former and language model + configurations. + Returns: + [`MiniGPT4`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/paddlevlp/models/minigpt4/modeling.py b/paddlevlp/models/minigpt4/modeling.py new file mode 100644 index 00000000000000..4239675bb7aaab --- /dev/null +++ b/paddlevlp/models/minigpt4/modeling.py @@ -0,0 +1,1775 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
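+
+# A minimal sketch of how the configuration classes from `configuration.py` compose
+# (illustrative only; the values noted are the documented defaults):
+#
+#     from paddlenlp.transformers.llama.configuration import LlamaConfig
+#     from paddlevlp.models.minigpt4.configuration import (
+#         MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig)
+#
+#     vision_config = MiniGPT4VisionConfig()    # hidden_size=1408, 39 layers, patch_size=14
+#     qformer_config = MiniGPT4QFormerConfig()  # hidden_size=768, 12 layers
+#     text_config = LlamaConfig()
+#     config = MiniGPT4Config.from_vision_qformer_text_configs(
+#         vision_config, qformer_config, text_config, num_query_tokens=32)
+#     # MiniGPT4Config ties qformer_config.encoder_hidden_size to vision_config.hidden_size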
+ +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.ops import transfer_param +from paddlenlp.utils.log import logger + +from paddlenlp.utils.initializer import normal_, ones_, zeros_ +from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.llama.modeling import LlamaForCausalLM +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from paddlenlp.transformers.model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) + +MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig + +__all__ = [ + "MiniGPT4Model", + "MiniGPT4PretrainedModel", + "MiniGPT4QFormerModel", + "MiniGPT4VisionModel", + "MiniGPT4ForConditionalGeneration", +] + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +def convert_weights_to_dtype(model, dtype: str): + # trying to convert model dtype if necessary + if dtype not in ["float16", "float32", "float64"]: + raise ValueError("Not supported dtype: {}., only [float16, float32, float64] supported.".format(dtype)) + dtype_mapping = { + "float16": paddle.float16, + "float32": paddle.float32, + "float64": paddle.float64, + } + + def convert_for_vit(layer): + if isinstance(layer, (nn.Linear, nn.Conv1D, nn.Conv2D)): + if layer.weight.dtype != dtype_mapping[dtype]: + layer.weight = transfer_param(layer.weight, restore_data=True, dtype=dtype) + if layer.bias is not None and layer.bias.dtype != dtype_mapping[dtype]: + layer.bias = transfer_param(layer.bias, restore_data=True, dtype=dtype) + + if isinstance(model, MiniGPT4VisionModel): + model.apply(convert_for_vit) + elif isinstance(model, (MiniGPT4QFormerModel, LlamaForCausalLM)): + model.to(dtype=dtype) + else: + raise TypeError("Not support model type: {}.".format(type(model))) + + +@dataclass +class MiniGPT4ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`MiniGPT4ForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. 
+ """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class MiniGPT4PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MiniGPT4Config + base_model_prefix = "minigpt4" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, MiniGPT4VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MiniGPT4Encoder): + module.gradient_checkpointing = value + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str = None, *args, **kwargs + ): + vit_dtype = kwargs.pop("vit_dtype", "float16") + qformer_dtype = kwargs.pop("qformer_dtype", "float32") + llama_dtype = kwargs.pop("llama_dtype", "float16") + + model = super().from_pretrained( + pretrained_model_name_or_path, from_hf_hub=from_hf_hub, subfolder=subfolder, *args, **kwargs + ) + + logger.info("Trying to convert dtype for MiniGPT4 model, it may take a while.") + if isinstance(model, (MiniGPT4Model, MiniGPT4ForConditionalGeneration)): + convert_weights_to_dtype(model.vision_model, dtype=vit_dtype) + convert_weights_to_dtype(model.qformer, dtype=qformer_dtype) + convert_weights_to_dtype(model.language_model, dtype=llama_dtype) + elif isinstance(model, MiniGPT4VisionModel): + convert_weights_to_dtype(model, dtype=vit_dtype) + elif isinstance(model, MiniGPT4QFormerModel): + convert_weights_to_dtype(model, dtype=qformer_dtype) + elif isinstance(model, LlamaForCausalLM): + convert_weights_to_dtype(model, dtype=llama_dtype) + else: + raise TypeError("Not supported model type: {}.".format(type(model))) + + return model + + +class MiniGPT4VisionEmbeddings(nn.Layer): + def __init__(self, config: MiniGPT4VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim])) + + self.patch_embedding = nn.Conv2D( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + 
self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter(paddle.randn([1, self.num_positions, self.embed_dim])) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds_shape = paddle.shape(patch_embeds) + patch_embeds = paddle.reshape( + patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] + ).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class MiniGPT4Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim])) + v_bias = Parameter(paddle.zeros([self.embed_dim])) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
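+        # Shape walk-through (descriptive note): with batch size B, sequence length T, heads H and head_dim D,
+        # query/key/value_states are [B, H, T, D] and attention_scores/attention_probs are [B, H, T, T]; the
+        # context below is transposed back to [B, T, H, D] and merged into [B, T, H * D] before the projection.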
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +class MiniGPT4MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class MiniGPT4EncoderLayer(nn.Layer): + def __init__(self, config: MiniGPT4Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = MiniGPT4Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = MiniGPT4MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class MiniGPT4Encoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`MiniGPT4EncoderLayer`]. + Args: + config (`MiniGPT4Config`): + The corresponding vision configuration for the `MiniGPT4Encoder`. 
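+
+    Example (an illustrative sketch; `config` is assumed to be the vision configuration used by
+    `MiniGPT4VisionModel`, with `hidden_size` and `num_hidden_layers` populated):
+
+    ```python
+    >>> encoder = MiniGPT4Encoder(config)
+    >>> inputs_embeds = paddle.randn([1, 257, config.hidden_size])  # e.g. 256 patches + 1 class token
+    >>> hidden = encoder(inputs_embeds=inputs_embeds, return_dict=True).last_hidden_state
+    ```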
+ """ + + def __init__(self, config: MiniGPT4Config): + super().__init__() + self.config = config + self.layers = nn.LayerList([MiniGPT4EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class MiniGPT4VisionModel(MiniGPT4PretrainedModel): + main_input_name = "pixel_values" + config_class = MiniGPT4VisionConfig + + def __init__(self, config: MiniGPT4VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = MiniGPT4VisionEmbeddings(config) + self.encoder = MiniGPT4Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) 
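+
+    # Minimal usage sketch (illustrative; assumes a populated `MiniGPT4VisionConfig` named `vision_config`):
+    #   vision_model = MiniGPT4VisionModel(vision_config)
+    #   pixel_values = paddle.randn([1, 3, vision_config.image_size, vision_config.image_size])
+    #   outputs = vision_model(pixel_values=pixel_values, return_dict=True)
+    #   patch_states = outputs.last_hidden_state  # [1, num_patches + 1, hidden_size]
+    #   pooled = outputs.pooler_output            # [1, hidden_size], taken from the class-token position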
+ + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class MiniGPT4QFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + 
encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
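+        # Caching note (descriptive): when `past_key_value` is provided, `key_layer` and `value_layer` above already
+        # include the cached states concatenated along the sequence axis, so `attention_probs` has shape
+        # [batch, num_heads, query_len, cached_len + query_len].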
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class MiniGPT4QFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MiniGPT4QFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = MiniGPT4QFormerMultiHeadAttention(config, is_cross_attention) + self.output = MiniGPT4QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class MiniGPT4QFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class MiniGPT4QFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MiniGPT4QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = MiniGPT4QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = MiniGPT4QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = MiniGPT4QFormerIntermediate(config) + self.output_query = MiniGPT4QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = 
self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class MiniGPT4QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [MiniGPT4QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class MiniGPT4QFormerModel(MiniGPT4PretrainedModel): + """ + Querying Transformer (Q-Former), used in MiniGPT4. + """ + + def __init__(self, config: MiniGPT4QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = MiniGPT4QFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.layernorm.weight.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.layernorm.weight.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. 
+ is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
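+
+        Example (an illustrative sketch; `qformer`, `query_tokens` and `image_embeds` are assumed to already
+        exist, e.g. as they are built inside `MiniGPT4Model`):
+        ```python
+        >>> image_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+        >>> out = qformer(
+        ...     query_embeds=query_tokens.expand([image_embeds.shape[0], -1, -1]),
+        ...     encoder_hidden_states=image_embeds,
+        ...     encoder_attention_mask=image_mask,
+        ...     return_dict=True,
+        ... )
+        >>> query_states = out.last_hidden_state  # [batch, num_query_tokens, qformer_hidden_size]
+        ```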
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds.cast(self.layernorm.weight.dtype)) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class MiniGPT4Model(MiniGPT4PretrainedModel): + config_class = MiniGPT4Config + 
main_input_name = "pixel_values" + + def __init__(self, config: MiniGPT4Config): + super().__init__(config) + + self.vision_model = MiniGPT4VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = MiniGPT4QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = LlamaForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import LlamaTokenizer, MiniGPT4Model + >>> tokenizer = LlamaTokenizer.from_pretrained("model_name") + >>> tokenizer.pad_token = tokenizer.eos_token + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> model.eval() + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, MiniGPT4Model + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return vision_outputs + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, MiniGPT4Model + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> qformer_outputs = model.get_qformer_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + 
encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + return query_outputs + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4Model + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + 
return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return MiniGPT4ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class MiniGPT4ForConditionalGeneration(MiniGPT4PretrainedModel): + config_class = MiniGPT4Config + main_input_name = "pixel_values" + + def __init__(self, config: MiniGPT4Config): + super().__init__(config) + self.config = config + self.vision_model = MiniGPT4VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = MiniGPT4QFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = LlamaForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + r""" + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, 
self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return MiniGPT4ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + first_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The first input prompt before the tag ``, it's embeddings will concat with image embeddings and the embeddings of the second_input_ids for the generation. + second_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The second input prompt after the tag ``, it's embeddings will concat with image embeddings and the embeddings of the first_input_ids for the generation. 
+            first_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The attention mask corresponding to the first_input_ids, used to avoid performing attention on padding token indices.
+            second_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The attention mask corresponding to the second_input_ids, used to avoid performing attention on padding token indices.
+        Returns:
+            outputs (`tuple`): The generation outputs of the underlying language model, typically the generated
+                token ids together with their scores, which can be decoded with the processor.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> import paddle
+        >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")
+        >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "describe this image"
+        >>> prompt = "###Human: ###Assistant:"
+        >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd")
+        >>> generated_ids, scores = model.generate(**inputs)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        ```
+        """
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+        vision_outputs = self.vision_model(pixel_values, return_dict=True)
+        image_embeds = vision_outputs.last_hidden_state
+        image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+        query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype)
+        image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype)
+        query_outputs = self.qformer(
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs.last_hidden_state
+
+        # step 3: use the language model, conditioned on the text and image
+        language_model_inputs = self.language_projection(query_output)
+        language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64")
+
+        first_embeds = self.language_model.llama.embed_tokens(first_input_ids)
+        second_embeds = self.language_model.llama.embed_tokens(second_input_ids)
+        language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype)
+        inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1)
+
+        if first_attention_mask is None:
+            first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64")
+        if second_attention_mask is None:
+            second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64")
+        attention_mask = paddle.concat(
+            [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1
+        )
+
+        outputs = self.language_model.generate(
+            inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generate_kwargs
+        )
+
+        return outputs
+
+    @paddle.no_grad()
+    def encode_images(
+        self,
+        pixel_values: paddle.Tensor,  # processed image
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """
+        Encodes input images into features the language model can consume: the vision encoder and the Q-Former
+        extract image features, which are then projected into the language model's embedding space.
+        Args:
+            pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)):
+                Input images to be processed.
+        Returns:
+            language_model_inputs (`paddle.Tensor`): The projected image features of shape
+                (batch_size, num_query_tokens, text_hidden_size), ready to be concatenated with text embeddings.
+            language_model_attention_mask (`paddle.Tensor`): The attention mask corresponding to the returned features.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> import paddle
+        >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")
+        >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> image = processor.process_images(images=image, return_tensors="pd")
+        >>> image_features, image_attention_mask = model.encode_images(**image)
+        ```
+        """
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+        vision_outputs = self.vision_model(pixel_values, return_dict=True)
+        image_embeds = vision_outputs.last_hidden_state
+        image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+        query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype)
+        image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype)
+        query_outputs = self.qformer(
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs.last_hidden_state
+
+        # step 3: project the query features into the language model's embedding space
+        language_model_inputs = self.language_projection(query_output)
+        language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64")
+
+        return language_model_inputs, language_model_attention_mask
+
+    @paddle.no_grad()
+    def generate_with_image_features(
+        self,
+        image_features: paddle.Tensor,
+        first_input_ids: paddle.Tensor,
+        second_input_ids: paddle.Tensor,
+        image_attention_mask: Optional[paddle.Tensor] = None,
+        first_attention_mask: Optional[paddle.Tensor] = None,
+        second_attention_mask: Optional[paddle.Tensor] = None,
+        **generate_kwargs,
+    ) -> paddle.Tensor:
+        """
+        Overrides the `generate` function so that the model can be used as a conditional generator with
+        precomputed image features (see `encode_images`).
+        Args:
+            image_features (`paddle.Tensor` of shape (batch_size, image_sequence_length, hidden_size)):
+                Image features extracted with the ViT and Q-Former, specifically the features returned by the `encode_images` method.
+            first_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The first input prompt before the tag ``; its embeddings will be concatenated with the image features and the embeddings of the second_input_ids for generation.
+            second_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The second input prompt after the tag ``; its embeddings will be concatenated with the image features and the embeddings of the first_input_ids for generation.
+            image_attention_mask (`paddle.Tensor` of shape (batch_size, image_sequence_length), *optional*):
+                The attention mask for the image_features.
+ first_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding to the first_input_ids. + second_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding to the second_input_ids. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/dog.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> processed_image = processor.process_images(images=image, return_tensors="pd") + >>> image_features, image_attention_mask = model.encode_images(**processed_image) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(text=text, prompt=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate_with_image_features(image_features, image_attention_mask=image_attention_mask, **inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + image_features = paddle.cast(image_features, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, image_features, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + if image_attention_mask is None: + image_attention_mask = paddle.ones(image_features.shape[:-1], dtype="int64") + + attention_mask = paddle.concat([first_attention_mask, image_attention_mask, second_attention_mask], axis=1) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generate_kwargs + ) + + return outputs diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py index 4738e3272555e6..04006999f0b629 100644 --- a/paddlevlp/processors/__init__.py +++ b/paddlevlp/processors/__init__.py @@ -14,3 +14,5 @@ # limitations under the License. from .blip_processing import * +from .minigpt4_processing import * +from .minigpt4_image_processing import * diff --git a/paddlevlp/processors/minigpt4_image_processing.py b/paddlevlp/processors/minigpt4_image_processing.py new file mode 100644 index 00000000000000..3a0b3302e9c799 --- /dev/null +++ b/paddlevlp/processors/minigpt4_image_processing.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for MiniGPT4.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from paddlenlp.transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from paddlenlp.transformers.image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddlenlp.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from paddlenlp.transformers.tokenizer_utils_base import TensorType + +__all__ = [ + "MiniGPT4ImageProcessor", +] + + +class MiniGPT4ImageProcessor(BaseImageProcessor): + r""" + Constructs a MiniGPT4 image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
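+
+    Examples (a minimal usage sketch; the processor is built from its in-code defaults rather than a checkpoint,
+    and the blank PIL image merely stands in for a real photo):
+
+    ```python
+    >>> from PIL import Image
+    >>> from paddlevlp.processors import MiniGPT4ImageProcessor
+    >>> image_processor = MiniGPT4ImageProcessor()  # size defaults to {"height": 224, "width": 224}
+    >>> image = Image.new("RGB", (400, 300), color="white")
+    >>> batch = image_processor(image, return_tensors="pd")
+    >>> batch["pixel_values"].shape  # [1, 3, 224, 224]
+    ```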
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + default_image_mean = [0.48145466, 0.4578275, 0.40821073] + default_image_std = [0.26862954, 0.26130258, 0.27577711] + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else default_image_mean + self.image_std = image_std if image_std is not None else default_image_std + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
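+
+        A small numeric sketch of the formula (illustrative values only, not tied to any checkpoint):
+
+        ```python
+        >>> import numpy as np
+        >>> image = np.array([0.2, 0.5, 0.8])      # one pixel, three channels, already rescaled to [0, 1]
+        >>> mean = np.array([0.5, 0.5, 0.5])
+        >>> std = np.array([0.25, 0.25, 0.25])
+        >>> (image - mean) / std                   # -> approximately [-1.2, 0.0, 1.2]
+        ```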
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` while preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
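+
+        Example (a sketch of overriding the stored defaults for a single call; the blank PIL image is a stand-in
+        for a real input):
+
+        ```python
+        >>> from PIL import Image
+        >>> from paddlevlp.processors import MiniGPT4ImageProcessor
+        >>> image_processor = MiniGPT4ImageProcessor()
+        >>> image = Image.new("RGB", (640, 480), color="white")
+        >>> outputs = image_processor.preprocess(image, do_normalize=False, return_tensors="np")
+        >>> outputs["pixel_values"][0].shape  # (3, 224, 224)
+        ```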
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlevlp/processors/minigpt4_processing.py b/paddlevlp/processors/minigpt4_processing.py new file mode 100644 index 00000000000000..f71acc7e4298e9 --- /dev/null +++ b/paddlevlp/processors/minigpt4_processing.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for MiniGPT4. 
+""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +from PIL import Image + +from paddlenlp.transformers.image_processing_utils import BatchFeature +from paddlenlp.transformers.image_utils import ImageInput +from paddlenlp.transformers.processing_utils import ProcessorMixin +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding, TensorType, TextInput + +__all__ = [ + "MiniGPT4Processor", +] + + +class MiniGPT4Processor(ProcessorMixin): + r""" + Constructs a MiniGPT4 processor which wraps a MiniGPT4 image processor and an llama tokenizer into a single processor. + [`MiniGPT4Processor`] offers all the functionalities of [`MiniGPT4ImageProcessor`] and [`LlamaTokenizer`]. See the docstring + of [`~MiniGPT4ImageProcessor.__call__`] and [`~LlamaTokenizer.decode`] for more information. + + Args: + image_processor (`MiniGPT4ImageProcessor`): + An instance of [`MiniGPT4ImageProcessor`]. The image processor is a required input. + tokenizer (`LlamaTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor + + >>> # load processor + >>> minigpt4_13b_path = "model_name" + >>> processor = MiniGPT4Processor.from_pretrained(minigpt4_13b_path) + >>> print("load processor and model done!") + + >>> # prepare model inputs for MiniGPT4 + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + >>> res = processor([image], text, prompt) + ```""" + attributes = ["image_processor", "tokenizer"] + image_processor_class = "MiniGPT4ImageProcessor" + tokenizer_class = "LlamaTokenizer" + + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + tokenizer.model_input_names = ["input_ids", "attention_mask"] + tokenizer.padding_side = "right" + tokenizer.pad_token = tokenizer.eos_token + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self.default_prompt = "###Human: ###Assistant: " + self.image_tag = "" + self.text_tag = "" + + def process_images( + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model. + Please refer to the docstring of the method for more information. 
+ """ + if not images: + raise ValueError("You have to input correct images.") + + if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): + images = [images] + + # processing with image processor + processed_images = self.image_processor(images, return_tensors=return_tensors) + + return processed_images + + def process_texts( + self, + texts: Union[TextInput, List[TextInput]], + prompts: Union[TextInput, List[TextInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ): + prompts = prompts if prompts is not None else [self.default_prompt] + + if (not isinstance(texts, TextInput)) and (not isinstance(texts, list)): + raise TypeError("Unsupported type for texts: {}, only str and list type supported.".format(type(texts))) + if prompts is not None and (not isinstance(prompts, TextInput)) and (not isinstance(prompts, list)): + raise TypeError( + "Unsupported type for prompts: {}, only str and list type supported.".format(type(prompts)) + ) + + if isinstance(prompts, list): + if isinstance(texts, list) and len(prompts) != len(texts): + raise ValueError( + "The length of prompts not is equal to texts' length: {} != {}".format(len(prompts), len(texts)) + ) + elif isinstance(texts, TextInput): + texts = [texts] * len(prompts) + else: + if isinstance(texts, TextInput): + texts = [texts] + prompts = [prompts] + else: + prompts = [prompts] * len(texts) + + assemble_texts = [] + for text, prompt in zip(texts, prompts): + if self.image_tag not in text: + if self.image_tag not in prompt: + raise ValueError( + "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.".format( + self.image_tag, self.image_tag + ) + ) + if self.text_tag not in prompt: + raise ValueError( + "A prompt should contain a text tag `{}` to insert text information.".format(self.text_tag) + ) + assemble_texts.append(prompt.replace(self.text_tag, text)) + else: + assemble_texts.append(text) + + # processing with text tokenizer + first_texts, second_texts = zip(*[assemble_text.split(self.image_tag) for assemble_text in assemble_texts]) + first_text_encoding = self.tokenizer( + text=first_texts, return_tensors=return_tensors, add_special_tokens=True, **kwargs + ) + second_text_encoding = self.tokenizer( + text=second_texts, return_tensors=return_tensors, add_special_tokens=False, **kwargs + ) + + encoded_texts = BatchEncoding( + { + "first_input_ids": first_text_encoding["input_ids"], + "first_attention_mask": first_text_encoding["attention_mask"], + "second_input_ids": second_text_encoding["input_ids"], + "second_attention_mask": second_text_encoding["attention_mask"], + } + ) + return encoded_texts + + def __call__( + self, + images: ImageInput = None, + text: str = None, + prompt: str = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model, and + [`LlamaTokenizer.__call__`] to prepare text for the model. + Please refer to the docstring of the above two methods for more information. 
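+
+        Examples (a minimal sketch; `"model_name"` is a placeholder checkpoint directory, and the prompt is built
+        from `processor.image_tag` / `processor.text_tag` so the image embeddings and the text land at the expected
+        positions):
+
+        ```python
+        >>> from PIL import Image
+        >>> from paddlevlp.processors import MiniGPT4Processor
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")
+        >>> image = Image.new("RGB", (224, 224), color="white")
+        >>> text = "describe this image"
+        >>> prompt = "###Human: {} {}###Assistant:".format(processor.image_tag, processor.text_tag)
+        >>> inputs = processor(images=image, text=text, prompt=prompt)
+        >>> list(inputs.keys())  # pixel_values, first_input_ids, first_attention_mask, second_input_ids, second_attention_mask
+        ```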
+ """ + prompt = prompt if prompt is not None else self.default_prompt + + if images is None and text is None: + raise ValueError("Images and text are None, you have to specify either images or texts.") + if images is not None and not isinstance(images, (Image.Image, np.ndarray, paddle.Tensor, list)): + raise TypeError( + "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.".format( + type(images) + ) + ) + if text is not None and not isinstance(text, str): + raise TypeError("A str type of text is expected, but received {}.".format(type(text))) + if prompt is not None and not isinstance(prompt, str): + raise TypeError("A str type of prompt is expected, but received {}.".format(type(prompt))) + + if images is not None and not isinstance(images, list): + images = [images] + if text is not None and images is not None: + texts = [text] * len(images) + prompts = [prompt] * len(images) + elif text is not None and images is None: + texts = [text] + prompts = [prompt] + + # image-only mode + if text is None: + # processing with image processor + processed_features = self.process_images(images, return_tensors=return_tensors, **kwargs) + return processed_features + + # text-only mode + if images is None: + # processing with text tokenizer + encoded_texts = self.process_texts(texts, prompts, **kwargs) + return encoded_texts + + # text-image mode + processed_features = self.image_processor(images, return_tensors=return_tensors) + encoded_texts = self.process_texts(texts, prompts, **kwargs) + processed_features.update(encoded_texts) + + return processed_features + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From 68459f6eb69e5714631e8bc926db6ea60f296aee Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Thu, 29 Jun 2023 05:59:13 +0000 Subject: [PATCH 04/10] add examples for minigpt4 --- paddlevlp/examples/minigpt4/README.md | 47 +++++++ paddlevlp/examples/minigpt4/merge_weight.py | 88 +++++++++++++ .../minigpt4/paddle_minigpt4_instrction.md | 117 ++++++++++++++++++ paddlevlp/examples/minigpt4/run_predict.py | 68 ++++++++++ 4 files changed, 320 insertions(+) create mode 100644 paddlevlp/examples/minigpt4/README.md create mode 100644 paddlevlp/examples/minigpt4/merge_weight.py create mode 100644 paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md create mode 100644 paddlevlp/examples/minigpt4/run_predict.py diff --git a/paddlevlp/examples/minigpt4/README.md b/paddlevlp/examples/minigpt4/README.md new file mode 100644 index 00000000000000..48c9f73840762b --- /dev/null +++ b/paddlevlp/examples/minigpt4/README.md @@ -0,0 +1,47 @@ +# MiniGPT4 + +## 1. 
模型简介 + +MiniGPT4 是一个具有图像理解能力的开源模型,其基于 Vicuna 大语言模型 以及 BLIP-2 中的VIT和Qformer模块进行训练,使得MiniGPT4 拥有类似于GPT4的非凡能力,例如详细的图像描述生成和从手写草稿创建网站。 此外 MiniGPT4 还具备一些的其他新的功能,包括根据给定图像写故事和诗歌,为图像中显示的问题提供解决方案,教用户如何根据食物照片做饭等。下图展示了MiniGPT4的模型结构, 更多信息请参考[MiniGPT4](https://arxiv.org/abs/2304.10592)。 + +
+
+
+## 2. 获取MiniGPT4 权重以及相关配置
+这里可以分两步：1. 获取MiniGPT4权重；2. 获取相关配置，包括模型参数说明以及tokenizer相关文件等。
+### 2.1 获取MiniGPT4权重
+目前需要用户手动下载MiniGPT4权重并转换为相应的 Paddle 版权重，为方便转换，本项目提供了相应的操作说明和转换脚本，详情请参考[MiniGPT4 权重下载和转换说明](./paddle_minigpt4_instrction.md)。
+
+### 2.2 获取相关配置
+下载相关的配置文件，这里提供了两版配置文件，请根据你的需要，点击下载即可。
+| files Aligned with MiniGPT4-7B | files Aligned with MiniGPT4-13B |
+|:-------------------------------------:|:-----------------------------------:|
+| [Download](https://paddlenlp.bj.bcebos.com/models/community/minigpt4-7b/minigpt4_7b.tar.gz) | [Download](https://paddlenlp.bj.bcebos.com/models/community/minigpt4-13b/minigpt4_13b.tar.gz) |
+
+
+下载之后进行解压，请将其中相关文件放至与 MiniGPT4 权重相同的目录中。
+
+
+## 3. 模型预测
+在下载和转换好上述模型权重之后，可执行以下命令进行模型预测。其中参数 `pretrained_name_or_path` 用于指定 MiniGPT4 的保存目录。
+
+```
+python run_predict.py \
+    --pretrained_name_or_path "your minigpt4 path"
+```
+
+下面的示例展示了使用 MiniGPT4-7B 时的效果:
+
+输入图片: https://paddlenlp.bj.bcebos.com/data/images/mugs.png
+ +输入文本:“describe this image” + +输出: +``` +The image shows two mugs with cats on them, one is black and white and the other is blue and white. The mugs are sitting on a table with a book in the background. The mugs have a whimsical, cartoon-like appearance. The cats on the mugs are looking at each other with a playful expression. The overall mood of the image is lighthearted and fun.### +``` + + +## Reference +- [MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models](https://minigpt-4.github.io/) diff --git a/paddlevlp/examples/minigpt4/merge_weight.py b/paddlevlp/examples/minigpt4/merge_weight.py new file mode 100644 index 00000000000000..8f74d7c6a96052 --- /dev/null +++ b/paddlevlp/examples/minigpt4/merge_weight.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["FLAGS_use_cuda_managed_memory"] = "true" + +import paddle +import torch + +from paddlenlp.transformers import LlamaForCausalLM + + +def merge(args): + model_dict = {} + # load the first item: blip2-flan-t5-xxl + state_dict = paddle.load(args.blip2_path) + for n, p in state_dict.items(): + if n.startswith("vision_model") or n.startswith("qformer") or n == "query_tokens": + model_dict[n] = p + print("[1/3] load ViT, qformer and query_tokens from blip2-flan-t5-xxl done!") + + # load the second item: vicuna + llama_model = LlamaForCausalLM.from_pretrained(args.vicuna_path) + + for n, p in llama_model.named_parameters(): + new_name = "language_model." + n + model_dict[new_name] = p + print("[2/3] load vicuna(llama typel) done!") + + # load the third item: minigpt4 + minigpt4_state_dict = torch.load(args.minigpt4_path) + for n, p in minigpt4_state_dict["model"].items(): + if n.startswith("llama_model.model"): + new_name = n.replace("llama_model.model", "language_model.llama") + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + if n.startswith("llama_proj"): + new_name = n.replace("llama_proj", "language_projection") + if n.endswith("weight"): + new_p = paddle.to_tensor(p.cpu().numpy()).transpose([1, 0]) + else: + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + print("[3/3] load language_projection, some llama weights from minigpt4 done!") + + save_path = os.path.join(args.save_path, "model_state.pdparams") + paddle.save(model_dict, save_path) + print("The checkpoint of minigpt4 has been saved to :{}".format(save_path)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--blip2_path", default="/blip2/dirname", type=str, help="The dir name of blip2-flan-t5-xxl.") + parser.add_argument("--vicuna_path", default="/vicuna/dirname", type=str, help="The dir name of vicuna.") + parser.add_argument( + "--minigpt4_path", default="/minigpt4/prerained_minigpt4.pth", type=str, help="The checkpoint path of vicuna." 
+ ) + parser.add_argument("--save_path", default="/save/to/dirname", type=str, help="The saving path of minigpt4.") + args = parser.parse_args() + + args.blip2_path = os.path.join(args.blip2_path, "model_state.pdparams") + if not os.path.exists(args.blip2_path): + raise ValueError("Not found the file: {}".format(args.blip2_path)) + if not os.path.isdir(args.vicuna_path): + raise ValueError("It is not a directory: {}".format(args.vicuna_path)) + if not os.path.exists(args.minigpt4_path): + raise ValueError("Not found the file: {}".format(args.minigpt4_path)) + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + + merge(args) diff --git a/paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md b/paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md new file mode 100644 index 00000000000000..7b84aea48bd7c6 --- /dev/null +++ b/paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md @@ -0,0 +1,117 @@ +# 获取和转换 Paddle 版 MiniGPT4 权重 + +## 1. 准备 MiniGPT4 中所有模块的权重 + +你需要下载3个权重,以获取最终 MiniGPT4的权重,分别是: +- Pretrained MiniGPT-4 +- Vicuna Weight +- Blip2 Weight + +### 1.1 下载 MiniGPT4 的预训练权重 + +根据你准备的Vicuna模型版本,下载预训练的MiniGPT4 权重。 + +| Checkpoint Aligned with Vicuna 7B | Checkpoint Aligned with Vicuna 13B | +:-------------------------------------:|:-----------------------------------: +[Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) | [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) + +### 1.2准备 ViT and Qformer 权重 +MiniGPT4中使用的ViT和Qformer Weight来自blip2-flan-t5-xxl,这个weight在PaddleNLP中进行了转换。 所以你可以从 PaddleNLP 下载它,你有两种下载方式进行下载: + +#### 1.2.1 通过 paddlenlp 方式加载 +直接通过paddlenlp的模型加载方法进行下载,下载后一般会存入 `PPNLP_HOME` 指定的目录。 + +```python +import os +os.environ["CUDA_VISIBLE_DEVICES"]="0" + +import paddle +from paddlenlp.transformers import Blip2Model, Blip2VisionModel, Blip2VisionConfig, Blip2QFormerConfig, Blip2QFormerModel + +Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xxl") +``` + +#### 1.2.2 直接点击下载 +可以直接进行点击下载: + +| blip2-flan-t5-xxl 权重 | 点击下载 | +:-------------------------------------:|:-----------------------------------: +| model_state.pdparams | [Download](https://paddlenlp.bj.bcebos.com/models/community/Salesforce/blip2-flan-t5-xxl/model_state.pdparams) | + +### 1.3 准备 Vicuna 权重 + +这里需要下载两个权重:Vicuna delta Weight和huggingface-formated Llama Weight。 然后你应该结合这两个重量来获得可以使用的Vicuna 权重。 + +#### 1.3.1 下载 Vicuna delta 权重 + +这里展示两种Vicuna delta 权重,请根据需要选择一种并点击下载。 + +| vicuna-7b-delta-v0 | vicuna-13b-delta-v0 | +:-------------------------------------:|:-----------------------------------: + [Download](https://huggingface.co/lmsys/vicuna-7b-delta-v0/tree/main) | [Download](https://huggingface.co/lmsys/vicuna-13b-delta-v0g) + +#### 1.3.2 根据以上选择的vicuna delta 权重,下载 相应的 llama 权重。 + +| llama-7b | llama-13b | +:-------------------------------------:|:-----------------------------------: + [Download](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main) | [Download](https://huggingface.co/decapoda-research/llama-13b-hf) + + +#### 1.3.3 结合上面的两个权重,得到可以使用的 vicuna 权重 +- 为组合如上两个权重,请安装以下工具: + +```shell +pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10 +``` +- 运行以下命令,获取最终可用的vicuna 权重 + +```shell +python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna-13b/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/ +``` + +## 2. 
将多个 pytorch 子权重文件合并为一个权重文件 + +Pytorch版的权重文件可能是由多个子权重文件组合而成,为使用PaddleNLP进行加载并自动转换为Paddle版,需要将其合并为一个文件: + +### 2.1 下载MiniGPT库 +在开始之前,请确保已经下载了 [MiniGPT4](https://github.com/Vision-CAIR/MiniGPT-4.git) 库: + +``` +git clone https://github.com/Vision-CAIR/MiniGPT-4.git +``` + +### 2.2 获取完整的 vicuna 权重 +进入到MiniGPT4文件夹,执行以下代码,获取完整的 vicuna 权重文件: +```python +import argparse +import os +os.environ["CUDA_VISIBLE_DEVICES"]="0" +os.environ["FLAGS_use_cuda_managed_memory"]="true" + +import torch +from minigpt4.models.modeling_llama import LlamaForCausalLM + +llama_model = LlamaForCausalLM.from_pretrained("/path/to/save/working/vicuna-13b/") +torch.save(llama_model.state_dict(), "/path/to/save/working/vicuna-13b/pytorch_model.bin") +``` + +## 3. 合并以上所有权重,获取最终的 Paddle 版 MiniGPT4 权重 +这里提供了一个合并以上权重的脚本,你可以通过设置相关权重路径 以获取最终的 MiniGPT4 权重。 + +```shell +python merge_weight.py \ + --blip2_path "your dir name of blip2" \ + --vicuna_path "your dir name of vicuna" \ + --minigpt4_path "your ckpt path of minigpt4" \ + --save_path "your dir name saving the final minigpt4" +``` + +**参数说明**: +- `blip2_path`: 存放 blip2 权重的目录名 +- `vicuna_path`: 存放 vicuna_path 权重的目录名 +- `minigpt4_path`: 存放 blip2 权重的文件地址,比如./prerained_minigpt4_7b.pth +- `save_path`: 保存 Paddle 版 MiniGPT3 权重的目录名 + +## 3. More Reference + +- [MiniGPT Official Site](https://github.com/Vision-CAIR/MiniGPT-4) diff --git a/paddlevlp/examples/minigpt4/run_predict.py b/paddlevlp/examples/minigpt4/run_predict.py new file mode 100644 index 00000000000000..4b36089f3c91a8 --- /dev/null +++ b/paddlevlp/examples/minigpt4/run_predict.py @@ -0,0 +1,68 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["FLAGS_use_cuda_managed_memory"] = "true" +import requests +from PIL import Image + +from paddlenlp.transformers import MiniGPT4ForConditionalGeneration, MiniGPT4Processor + + +def predict(args): + # load MiniGPT4 moel and processor + model = MiniGPT4ForConditionalGeneration.from_pretrained(args.pretrained_name_or_path) + model.eval() + processor = MiniGPT4Processor.from_pretrained(args.pretrained_name_or_path) + print("load processor and model done!") + + # prepare model inputs for MiniGPT4 + url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + image = Image.open(requests.get(url, stream=True).raw) + + text = "describe this image" + prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. 
Please answer my questions.###Human: ###Assistant:" + inputs = processor([image], text, prompt) + + # generate with MiniGPT4 + # breakpoint + generate_kwargs = { + "max_length": 300, + "num_beams": 1, + "top_p": 1.0, + "repetition_penalty": 1.0, + "length_penalty": 0, + "temperature": 1, + "decode_strategy": "greedy_search", + "eos_token_id": [[835], [2277, 29937]], + } + outputs = model.generate(**inputs, **generate_kwargs) + msg = processor.batch_decode(outputs[0]) + print("Inference result: ", msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_name_or_path", + default="your directory of minigpt4", + type=str, + help="The dir name of minigpt4 checkpoint.", + ) + args = parser.parse_args() + + predict(args) From 3d50d5396f62a1f8124af22730c3ec2420e52d96 Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Thu, 29 Jun 2023 11:45:53 +0000 Subject: [PATCH 05/10] [New Feature] drop some paddlenlp and add some files --- paddlevlp/activations.py | 174 +++++ paddlevlp/examples/minigpt4/run_predict.py | 3 +- paddlevlp/models/__init__.py | 2 + paddlevlp/models/minigpt4/configuration.py | 2 +- paddlevlp/models/minigpt4/modeling.py | 10 +- paddlevlp/processors/base_processing.py | 140 ++++ paddlevlp/processors/image_transforms.py | 656 ++++++++++++++++++ .../processors/minigpt4_image_processing.py | 9 +- paddlevlp/processors/minigpt4_processing.py | 7 +- paddlevlp/processors/utils.py | 1 - paddlevlp/utils/initializer.py | 421 +++++++++++ paddlevlp/utils/log.py | 2 +- paddlevlp/utils/parameters.py | 53 ++ 13 files changed, 1463 insertions(+), 17 deletions(-) create mode 100644 paddlevlp/activations.py create mode 100644 paddlevlp/processors/base_processing.py create mode 100644 paddlevlp/processors/image_transforms.py create mode 100644 paddlevlp/utils/initializer.py create mode 100644 paddlevlp/utils/parameters.py diff --git a/paddlevlp/activations.py b/paddlevlp/activations.py new file mode 100644 index 00000000000000..db1aecc829d96a --- /dev/null +++ b/paddlevlp/activations.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F +from paddle import Tensor, nn + + +class NewGELUActivation(nn.Layer): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Layer): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Layer): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * F.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Layer): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return paddle.clip(gelu(x), self.min, self.max) + + +class SiLUActivation(nn.Layer): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def forward(self, input: Tensor) -> Tensor: + return F.silu(input) + + +class MishActivation(nn.Layer): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def forward(self, input: Tensor) -> Tensor: + return F.mish(input) + + +class LinearActivation(nn.Layer): + """ + Applies the linear activation function, i.e. forwarding input directly to output. 
+ """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "gelu": GELUActivation, + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), + "gelu_fast": FastGELUActivation, + "gelu_new": NewGELUActivation, + "gelu_python": (GELUActivation, {"use_gelu_python": True}), + "linear": LinearActivation, + "mish": MishActivation, + "quick_gelu": QuickGELUActivation, + "relu": nn.ReLU, + "relu6": nn.ReLU6, + "sigmoid": nn.Sigmoid, + "silu": SiLUActivation, + "swish": SiLUActivation, + "tanh": nn.Tanh, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") \ No newline at end of file diff --git a/paddlevlp/examples/minigpt4/run_predict.py b/paddlevlp/examples/minigpt4/run_predict.py index 4b36089f3c91a8..cb9ac139002753 100644 --- a/paddlevlp/examples/minigpt4/run_predict.py +++ b/paddlevlp/examples/minigpt4/run_predict.py @@ -20,7 +20,7 @@ import requests from PIL import Image -from paddlenlp.transformers import MiniGPT4ForConditionalGeneration, MiniGPT4Processor +from paddlevlp import MiniGPT4ForConditionalGeneration, MiniGPT4Processor def predict(args): @@ -39,7 +39,6 @@ def predict(args): inputs = processor([image], text, prompt) # generate with MiniGPT4 - # breakpoint generate_kwargs = { "max_length": 300, "num_beams": 1, diff --git a/paddlevlp/models/__init__.py b/paddlevlp/models/__init__.py index 904dfbb7a6d3d2..77ef10b5801c9c 100644 --- a/paddlevlp/models/__init__.py +++ b/paddlevlp/models/__init__.py @@ -14,3 +14,5 @@ # limitations under the license. 
from .blip2.modeling import * +from .minigpt4.configuration import * +from .minigpt4.modeling import * \ No newline at end of file diff --git a/paddlevlp/models/minigpt4/configuration.py b/paddlevlp/models/minigpt4/configuration.py index 4f9a5ec08b782f..9ba18709ca2c60 100644 --- a/paddlevlp/models/minigpt4/configuration.py +++ b/paddlevlp/models/minigpt4/configuration.py @@ -17,7 +17,7 @@ import os from typing import Union -from paddlenlp.utils.log import logger +from ...utils.log import logger from paddlenlp.transformers.auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.transformers.llama.configuration import LlamaConfig diff --git a/paddlevlp/models/minigpt4/modeling.py b/paddlevlp/models/minigpt4/modeling.py index 4239675bb7aaab..ad647b3279459c 100644 --- a/paddlevlp/models/minigpt4/modeling.py +++ b/paddlevlp/models/minigpt4/modeling.py @@ -22,11 +22,6 @@ from paddle.distributed.fleet.utils import recompute from paddle.nn import CrossEntropyLoss -from paddlenlp.ops import transfer_param -from paddlenlp.utils.log import logger - -from paddlenlp.utils.initializer import normal_, ones_, zeros_ -from paddlenlp.transformers.activations import ACT2FN from paddlenlp.transformers.llama.modeling import LlamaForCausalLM from paddlenlp.transformers.model_outputs import ( BaseModelOutput, @@ -42,6 +37,11 @@ prune_linear_layer, ) +from ...utils.log import logger +from ...activations import ACT2FN +from ...utils.initializer import normal_, ones_, zeros_ +from ...utils.parameters import transfer_param + MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig diff --git a/paddlevlp/processors/base_processing.py b/paddlevlp/processors/base_processing.py new file mode 100644 index 00000000000000..7c599e12b5b3ea --- /dev/null +++ b/paddlevlp/processors/base_processing.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Processing saving/loading class for common processors. +""" + +import os + +import paddlenlp.transformers +import paddlevlp.processors + + +class ProcessorMixin(object): + """ + This is a mixin used to provide saving/loading functionality for all processor classes. 
+ """ + + attributes = ["feature_extractor", "tokenizer"] + # Names need to be attr_class for attr in attributes + feature_extractor_class = None + tokenizer_class = None + _auto_class = None + + # args have to match the attributes class attribute + def __init__(self, *args, **kwargs): + # Sanitize args and kwargs + for key in kwargs: + if key not in self.attributes: + raise TypeError(f"Unexepcted keyword argument {key}.") + for arg, attribute_name in zip(args, self.attributes): + if attribute_name in kwargs: + raise TypeError(f"Got multiple values for argument {attribute_name}.") + else: + kwargs[attribute_name] = arg + + if len(kwargs) != len(self.attributes): + raise ValueError( + f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got " + f"{len(args)} arguments instead." + ) + + # Check each arg is of the proper class (this will also catch a user initializing in the wrong order) + for attribute_name, arg in kwargs.items(): + setattr(self, attribute_name, arg) + + def __repr__(self): + attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] + attributes_repr = "\n".join(attributes_repr) + return f"{self.__class__.__name__}:\n{attributes_repr}" + + def save_pretrained(self, save_directory, **kwargs): + """ + Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it + can be reloaded using the [`~ProcessorMixin.from_pretrained`] method. + + + + This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods + above for more information. + + + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will + be created if it does not exist). + kwargs: + Additional key word arguments. + """ + os.makedirs(save_directory, exist_ok=True) + + for attribute_name in self.attributes: + attribute = getattr(self, attribute_name) + # Include the processor class in the attribute config so this processor can then be reloaded with the + # `AutoProcessor` API. + if hasattr(attribute, "_set_processor_class"): + attribute._set_processor_class(self.__class__.__name__) + attribute.save_pretrained(save_directory) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate a processor associated with a pretrained model. + + + + This class method is simply calling the feature extractor + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the + methods above for more information. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the name of a community-contributed pretrained or built-in pretrained model. + - a path to a *directory* containing a feature extractor file saved using the + [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + **kwargs + Additional keyword arguments passed along to both + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. 
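+
+        Example (a sketch with placeholder paths, using the MiniGPT4 subclass added elsewhere in this patch):
+
+        ```python
+        >>> from paddlevlp.processors import MiniGPT4Processor
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")          # placeholder name or local dir
+        >>> processor.save_pretrained("./my_processor_dir")                      # writes image processor + tokenizer files
+        >>> reloaded = MiniGPT4Processor.from_pretrained("./my_processor_dir")   # reload from the saved directory
+        ```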
+ """ + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + return cls(*args) + + @classmethod + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + args = [] + for attribute_name in cls.attributes: + class_name = getattr(cls, f"{attribute_name}_class") + # attribute class in paddlevlp has higher priority, usually used by vision class + attribute_class = getattr(paddlevlp.processors, class_name, None) + if attribute_class is None: + attribute_class = getattr(paddlenlp.transformers, class_name) + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + return args + + @property + def model_input_names(self): + first_attribute = getattr(self, self.attributes[0]) + return getattr(first_attribute, "model_input_names", None) \ No newline at end of file diff --git a/paddlevlp/processors/image_transforms.py b/paddlevlp/processors/image_transforms.py new file mode 100644 index 00000000000000..c090cc4758cb27 --- /dev/null +++ b/paddlevlp/processors/image_transforms.py @@ -0,0 +1,656 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np +import paddle +import PIL + +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) + +from paddlenlp.transformers.tokenizer_utils_base import ExplicitEnum, TensorType + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_channel_dimension_format( + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, +) -> np.ndarray: + """ + Converts `image` to the channel dimension format specified by `channel_dim`. + + Args: + image (`numpy.ndarray`): + The image to have its channel dimension set. + channel_dim (`ChannelDimension`): + The channel dimension format to use. + + Returns: + `np.ndarray`: The image with the channel dimension set to `channel_dim`. 
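+
+    Example (a small sketch on a dummy array):
+
+    ```python
+    >>> import numpy as np
+    >>> hwc = np.zeros((224, 224, 3), dtype=np.uint8)   # height, width, channels
+    >>> to_channel_dimension_format(hwc, ChannelDimension.FIRST).shape
+    (3, 224, 224)
+    ```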
+ """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if input_channel_dim is None: + input_channel_dim = infer_channel_dimension_format(image) + + target_channel_dim = ChannelDimension(channel_dim) + if input_channel_dim == target_channel_dim: + return image + + if target_channel_dim == ChannelDimension.FIRST: + image = image.transpose((2, 0, 1)) + elif target_channel_dim == ChannelDimension.LAST: + image = image.transpose((1, 2, 0)) + else: + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + + return image + + +def rescale( + image: np.ndarray, scale: float, data_format: Optional[ChannelDimension] = None, dtype=np.float32 +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`float`): + The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. + + Returns: + `np.ndarray`: The rescaled image. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) + return rescaled_image + + +def to_pil_image( + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": + """ + Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if + needed. + + Args: + image (`PIL.Image.Image` or `numpy.ndarray` or `paddle.Tensor`): + The image to convert to the `PIL.Image` format. + do_rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default + to `True` if the image type is a floating type, `False` otherwise. + + Returns: + `PIL.Image.Image`: The converted image. + """ + if isinstance(image, PIL.Image.Image): + return image + + # Convert all tensors to numpy arrays before converting to PIL image + if is_paddle_tensor(image): + image = image.numpy() + elif not isinstance(image, np.ndarray): + raise ValueError("Input image type not supported: {}".format(type(image))) + + # If the channel as been moved to first dim, we put it back at the end. + image = to_channel_dimension_format(image, ChannelDimension.LAST) + + # If there is a single channel, we squeeze it, as otherwise PIL can't handle it. + image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image + + # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
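+    # If do_rescale is unset, rescale only floating point arrays, which are assumed to lie in [0, 1].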
+ do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale + if do_rescale: + image = rescale(image, 255) + image = image.astype(np.uint8) + return PIL.Image.fromarray(image) + + +# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + + Returns: + `tuple`: The target (height, width) dimension of the output image after resizing. + """ + if isinstance(size, (tuple, list)): + if len(size) == 2: + return tuple(size) + elif len(size) == 1: + # Perform same logic as if size was an int + size = size[0] + else: + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") + + if default_to_square: + return (size, size) + + height, width = get_image_size(input_image) + short, long = (width, height) if width <= height else (height, width) + requested_new_short = size + + new_short, new_long = requested_new_short, int(requested_new_short * long / short) + + if max_size is not None: + if max_size <= requested_new_short: + raise ValueError( + f"max_size = {max_size} must be strictly greater than the requested " + f"size for the smaller edge size = {size}" + ) + if new_long > max_size: + new_short, new_long = int(max_size * new_short / new_long), max_size + + return (new_long, new_short) if width <= height else (new_short, new_long) + + +def resize( + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: + """ + Resizes `image` to `(height, width)` specified by `size` using the PIL library. 
+ + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to resize. + size (`Tuple[int, int]`): + The size to use for resizing the image. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + reducing_gap (`int`, *optional*): + Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to + the fair resampling. See corresponding Pillow documentation for more details. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + return_numpy (`bool`, *optional*, defaults to `True`): + Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is + returned. + + Returns: + `np.ndarray`: The resized image. + """ + resample = resample if resample is not None else PILImageResampling.BILINEAR + + if not len(size) == 2: + raise ValueError("size must have 2 elements") + + # For all transformations, we want to keep the same data format as the input image unless otherwise specified. + # The resized image from PIL will always have channels last, so find the input format first. + data_format = infer_channel_dimension_format(image) if data_format is None else data_format + + # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use + # the pillow library to resize the image and then convert back to numpy + if not isinstance(image, PIL.Image.Image): + image = to_pil_image(image) + height, width = size + # PIL images are in the format (width, height) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) + + if return_numpy: + resized_image = np.array(resized_image) + # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image + # so we need to add it back if necessary. + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image + # The image is always in channels last format after converting from a PIL image + resized_image = to_channel_dimension_format( + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) + return resized_image + + +def normalize( + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: + """ + Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. + + image = (image - mean) / std + + Args: + image (`np.ndarray`): + The image to normalize. + mean (`float` or `Iterable[float]`): + The mean to use for normalization. + std (`float` or `Iterable[float]`): + The standard deviation to use for normalization. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - + # casting to numpy array and dividing by 255. 
+ image = to_numpy_array(image) + image = rescale(image, scale=1 / 255) + + if not isinstance(image, np.ndarray): + raise ValueError("image must be a numpy array") + + input_data_format = infer_channel_dimension_format(image) + channel_axis = get_channel_dimension_axis(image) + num_channels = image.shape[channel_axis] + + if isinstance(mean, Iterable): + if len(mean) != num_channels: + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") + else: + mean = [mean] * num_channels + mean = np.array(mean, dtype=image.dtype) + + if isinstance(std, Iterable): + if len(std) != num_channels: + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") + else: + std = [std] * num_channels + std = np.array(std, dtype=image.dtype) + + if input_data_format == ChannelDimension.LAST: + image = (image - mean) / std + else: + image = ((image.T - mean) / std).T + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image + + +def center_crop( + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: + """ + Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to + the size given, it will be padded (so the returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + The image to crop. + size (`Tuple[int, int]`): + The target size for the cropped image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + return_numpy (`bool`, *optional*): + Whether or not to return the cropped image as a numpy array. Used for backwards compatibility with the + previous ImageFeatureExtractionMixin method. + - Unset: will return the same type as the input image. + - `True`: will return a numpy array. + - `False`: will return a `PIL.Image.Image` object. + Returns: + `np.ndarray`: The cropped image. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + image = to_numpy_array(image) + return_numpy = False if return_numpy is None else return_numpy + else: + return_numpy = True if return_numpy is None else return_numpy + + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if not isinstance(size, Iterable) or len(size) != 2: + raise ValueError("size must have 2 elements representing the height and width of the output image") + + input_data_format = infer_channel_dimension_format(image) + output_data_format = data_format if data_format is not None else input_data_format + + # We perform the crop in (C, H, W) format and then convert to the output format + image = to_channel_dimension_format(image, ChannelDimension.FIRST) + + orig_height, orig_width = get_image_size(image) + crop_height, crop_width = size + crop_height, crop_width = int(crop_height), int(crop_width) + + # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result. 
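+    # (orig - crop) // 2 keeps the crop centered; when the difference is odd, the extra pixel falls on the bottom/right.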
+ top = (orig_height - crop_height) // 2 + bottom = top + crop_height + # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result. + left = (orig_width - crop_width) // 2 + right = left + crop_width + + # Check if cropped area is within image boundaries + if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width: + image = image[..., top:bottom, left:right] + image = to_channel_dimension_format(image, output_data_format) + return image + + # Otherwise, we may need to pad if the image is too small. Oh joy... + new_height = max(crop_height, orig_height) + new_width = max(crop_width, orig_width) + new_shape = image.shape[:-2] + (new_height, new_width) + new_image = np.zeros_like(image, shape=new_shape) + + # If the image is too small, pad it with zeros + top_pad = (new_height - orig_height) // 2 + bottom_pad = top_pad + orig_height + left_pad = (new_width - orig_width) // 2 + right_pad = left_pad + orig_width + new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image + + top += top_pad + bottom += top_pad + left += left_pad + right += left_pad + + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] + new_image = to_channel_dimension_format(new_image, output_data_format) + + if not return_numpy: + new_image = to_pil_image(new_image) + + return new_image + + +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": + center_x, center_y, width, height = bboxes_center.unbind(-1) + bbox_corners = paddle.stack( + # top left x, top left y, bottom right x, bottom right y + [(center_x - 0.5 * width), (center_y - 0.5 * height), (center_x + 0.5 * width), (center_y + 0.5 * height)], + axis=-1, + ) + return bbox_corners + + +def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: + center_x, center_y, width, height = bboxes_center.T + bboxes_corners = np.stack( + # top left x, top left y, bottom right x, bottom right y + [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height], + axis=-1, + ) + return bboxes_corners + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(bboxes_center: TensorType) -> TensorType: + """ + Converts bounding boxes from center format to corners format. 
+ + center format: contains the coordinate for the center of the box and its width, height dimensions + (center_x, center_y, width, height) + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + """ + # Function is used during model forward pass, so we use the input framework if possible, without + # converting to numpy + if is_paddle_tensor(bboxes_center): + return _center_to_corners_format_paddle(bboxes_center) + elif isinstance(bboxes_center, np.ndarray): + return _center_to_corners_format_numpy(bboxes_center) + + raise ValueError(f"Unsupported input type {type(bboxes_center)}") + + +def _corners_to_center_format_paddle(bboxes_corners: "paddle.Tensor") -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) + b = [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ] + return paddle.stack(b, axis=-1) + + +def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T + bboxes_center = np.stack( + [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ], + axis=-1, + ) + return bboxes_center + + +def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: + """ + Converts bounding boxes from corners format to center format. + + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + center format: contains the coordinate for the center of the box and its the width, height dimensions + (center_x, center_y, width, height) + """ + # Inverse function accepts different input types so implemented here too + if is_paddle_tensor(bboxes_corners): + return _corners_to_center_format_paddle(bboxes_corners) + elif isinstance(bboxes_corners, np.ndarray): + return _corners_to_center_format_numpy(bboxes_corners) + + raise ValueError(f"Unsupported input type {type(bboxes_corners)}") + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. + """ + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + """ + Converts unique ID to RGB color. + """ + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class PaddingMode(ExplicitEnum): + """ + Enum class for the different padding modes to use when padding images. 
+ """ + + CONSTANT = "constant" + REFLECT = "reflect" + REPLICATE = "replicate" + SYMMETRIC = "symmetric" + + +def pad( + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> np.ndarray: + """ + Pads the `image` with the specified (height, width) `padding` and `mode`. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + def _expand_for_data_format(values): + """ + Convert values to be in the format expected by np.pad based on the data format. 
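+
+        For example, an int padding of 2 on a channels-last image becomes
+        ((2, 2), (2, 2), (0, 0)): the same pad before and after along height and
+        width, plus a zero pad for the channel axis.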
+ """ + if isinstance(values, (int, float)): + values = ((values, values), (values, values)) + elif isinstance(values, tuple) and len(values) == 1: + values = ((values[0], values[0]), (values[0], values[0])) + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): + values = (values, values) + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): + values = values + else: + raise ValueError(f"Unsupported format: {values}") + + # add 0 for channel dimension + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) + + # Add additional padding if there's a batch dimension + values = (0, *values) if image.ndim == 4 else values + return values + + padding = _expand_for_data_format(padding) + + if mode == PaddingMode.CONSTANT: + constant_values = _expand_for_data_format(constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image + + +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. + + Args: + image (Image): + The image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + + image = image.convert("RGB") + return image \ No newline at end of file diff --git a/paddlevlp/processors/minigpt4_image_processing.py b/paddlevlp/processors/minigpt4_image_processing.py index 3a0b3302e9c799..08f70cd83d7f95 100644 --- a/paddlevlp/processors/minigpt4_image_processing.py +++ b/paddlevlp/processors/minigpt4_image_processing.py @@ -20,15 +20,17 @@ import numpy as np import PIL -from paddlenlp.transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from paddlenlp.transformers.image_transforms import ( +from paddlenlp.transformers.tokenizer_utils_base import TensorType + +from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from .image_transforms import ( convert_to_rgb, normalize, rescale, resize, to_channel_dimension_format, ) -from paddlenlp.transformers.image_utils import ( +from .image_utils import ( ChannelDimension, ImageInput, PILImageResampling, @@ -36,7 +38,6 @@ to_numpy_array, valid_images, ) -from paddlenlp.transformers.tokenizer_utils_base import TensorType __all__ = [ "MiniGPT4ImageProcessor", diff --git a/paddlevlp/processors/minigpt4_processing.py b/paddlevlp/processors/minigpt4_processing.py index f71acc7e4298e9..0b46d01ef468d2 100644 --- a/paddlevlp/processors/minigpt4_processing.py +++ b/paddlevlp/processors/minigpt4_processing.py @@ -23,11 +23,12 @@ import paddle from PIL import Image -from paddlenlp.transformers.image_processing_utils import BatchFeature -from paddlenlp.transformers.image_utils import ImageInput -from paddlenlp.transformers.processing_utils import ProcessorMixin from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding, TensorType, TextInput +from .image_processing_utils import BatchFeature +from .image_utils import ImageInput +from .base_processing import ProcessorMixin + 
__all__ = [ "MiniGPT4Processor", ] diff --git a/paddlevlp/processors/utils.py b/paddlevlp/processors/utils.py index 34dd36fe33fea3..896c4bcd24820b 100644 --- a/paddlevlp/processors/utils.py +++ b/paddlevlp/processors/utils.py @@ -14,7 +14,6 @@ from enum import Enum - class ExplicitEnum(Enum): """ Enum with more explicit error message for missing values. diff --git a/paddlevlp/utils/initializer.py b/paddlevlp/utils/initializer.py new file mode 100644 index 00000000000000..f963a6de0ae25f --- /dev/null +++ b/paddlevlp/utils/initializer.py @@ -0,0 +1,421 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. +""" + +import math +import warnings + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.fluid import core +from paddle.fluid.core import VarDesc +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.uniform_(min=a, max=b) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.fill_(value) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.0): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. 
+ Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == "fan_in" else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = ["linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", "conv_transpose2d", "conv_transpose3d"] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return 3.0 / 4 + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) + + +def _transform(t, device, dtype, blocking): + if device is None: + device = t.place + if dtype is None: + dtype = t.dtype + + if type(dtype) is not VarDesc.VarType: + dtype = convert_np_dtype_to_dtype_(dtype) + + # 1. gpu place need to determine whether the memory is sufficient for allocation: + if t.place.is_gpu_place(): + # for gpu, minimum memory allocation unit is 256 bytes. + size_dtype = core.size_of_dtype(dtype) + # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. + # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. + waiting_alloc_memory = ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + gpu_memory_available = core.gpu_memory_available() + if gpu_memory_available < waiting_alloc_memory: + # Copy param / Tensor to cpu + t_used = t._copy_to(paddle.CPUPlace(), blocking) # k-v type will error + # Release mem of t + t.value().get_tensor()._clear() + else: + t_used = t + else: + t_used = t + + # 2. cast param / Tensor to dtype + if dtype is not None and dtype != t_used.dtype: + with paddle.fluid.framework._dygraph_place_guard(place=t_used.place): + t_casted = t_used.cast(dtype=dtype) + else: + t_casted = t_used + + # 3. Copy casted cpu param / Tensor to device + if device is not None and not t_casted.place._equals(device): + new_t = t_casted._copy_to(device, blocking) + else: + new_t = t_casted + + # 4. 
share Tensor to origin param / Tensor + dst_tensor = t.value().get_tensor() + src_tensor = new_t.value().get_tensor() + dst_tensor._share_data_with(src_tensor) + + return t + + +def to( + self, + device=None, + dtype=None, + blocking=None, + floating_only=True, +): + """ + Cast the parameters and buffers of Layer by the give device, dtype and blocking. + + Parameters: + device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored. + If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. Default: None. + + dtype(str|numpy.dtype|paddle.dtype|None, optional): The type of the data. If None, the dtype is the same with the original Tensor. Default: None. + + blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be + asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. + + floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking. + + Returns: + self + + """ + + if device is None and dtype is None and blocking is None: + return self + + if device is not None: + if isinstance(device, str): + device = paddle.device._convert_to_place(device) + elif isinstance( + device, + ( + core.CPUPlace, + core.CUDAPlace, + core.CUDAPinnedPlace, + core.XPUPlace, + ), + ): + pass + else: + raise ValueError( + "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " + + type(device).__name__ + ) + + if blocking is None: + blocking = True + else: + assert isinstance(blocking, bool), "blocking value error, must be the True, False or None" + + def transform(t, device, dtype, blocking): + if floating_only and (not paddle.is_floating_point(t)): + return t + return _transform(t, device, dtype, blocking) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + transform(self, device, dtype, blocking) + + return self \ No newline at end of file diff --git a/paddlevlp/utils/log.py b/paddlevlp/utils/log.py index 78d2d824b99a14..cce25443e414f3 100644 --- a/paddlevlp/utils/log.py +++ b/paddlevlp/utils/log.py @@ -42,7 +42,7 @@ class Logger(object): """ def __init__(self, name: str = None): - name = "PaddleNLP" if not name else name + name = "PaddleMIX" if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): diff --git a/paddlevlp/utils/parameters.py b/paddlevlp/utils/parameters.py new file mode 100644 index 00000000000000..168e029791da21 --- /dev/null +++ b/paddlevlp/utils/parameters.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
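+# transfer_param casts an existing parameter (or bias) to the target dtype, float16 by default,
+# optionally carrying its current values over when restore_data is True.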
+ +import paddle + + +def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): + param_shape = p.shape + # Allow CPU/GPU and float16/float32 transfer + # NOTE: str(p.place) differs between paddle develop and 2.2 + if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): + return p + if restore_data: + if ( + getattr(paddle.fluid.framework, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework, "_dygraph_tracer_", None) is not None + ) or ( + hasattr(paddle.fluid.framework, "global_var") + and getattr(paddle.fluid.framework.global_var, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework.global_var, "_dygraph_tracer_", None) is not None + ): + param_data = p.numpy() + new_p = paddle.create_parameter(shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p.set_value(param_data.astype(dtype)) + return new_p + elif paddle.in_dynamic_mode(): + param_data = p.numpy() + # Creating parameters with Assign initializer is too slow. Maybe we + # can cast to fp16 directly and get a tensor, while we do it more + # elaborately to get a ParamBase. Also note `VarBase.set_value` + # enforce the same dtype and can not be used directly. + new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) + return new_p + else: + param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) + return paddle.create_parameter( + shape=param_shape, + dtype=dtype, + is_bias=is_bias, + default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, + ) \ No newline at end of file From 9d94a9db8adc29960ace23bbaed33e17553bde1a Mon Sep 17 00:00:00 2001 From: LokeZhou Date: Mon, 3 Jul 2023 07:41:49 +0000 Subject: [PATCH 06/10] add groundingdino --- paddlevlp/examples/groundingdino/README.md | 25 + paddlevlp/examples/groundingdino/__init__.py | 13 + .../examples/groundingdino/run_predict.py | 124 ++ paddlevlp/models/groundingdino/__init__.py | 14 + .../models/groundingdino/backbone/__init__.py | 1 + .../models/groundingdino/backbone/backbone.py | 94 ++ .../backbone/position_encoding.py | 182 +++ .../backbone/swin_transformer.py | 897 ++++++++++++++ paddlevlp/models/groundingdino/bert_model.py | 715 +++++++++++ paddlevlp/models/groundingdino/bertwarper.py | 277 +++++ .../models/groundingdino/configuration.py | 168 +++ paddlevlp/models/groundingdino/csrc/README.md | 85 ++ .../csrc/ms_deformable_attn_op.cc | 65 + .../csrc/ms_deformable_attn_op.cu | 1073 +++++++++++++++++ .../csrc/setup_ms_deformable_attn_op.py | 7 + .../csrc/test_ms_deformable_attn_op.py | 140 +++ .../models/groundingdino/fuse_modules.py | 312 +++++ paddlevlp/models/groundingdino/layers.py | 256 ++++ paddlevlp/models/groundingdino/modeling.py | 285 +++++ .../models/groundingdino/ms_deform_attn.py | 210 ++++ paddlevlp/models/groundingdino/transformer.py | 970 +++++++++++++++ .../groundingdino/transformer_vanilla.py | 122 ++ paddlevlp/models/groundingdino/utils.py | 270 +++++ paddlevlp/processors/__init__.py | 1 + .../processors/groundingdino_processing.py | 365 ++++++ paddlevlp/processors/utils.py | 9 + 26 files changed, 6680 insertions(+) create mode 100644 paddlevlp/examples/groundingdino/README.md create mode 100644 paddlevlp/examples/groundingdino/__init__.py create mode 100644 paddlevlp/examples/groundingdino/run_predict.py create mode 100644 paddlevlp/models/groundingdino/__init__.py create mode 100644 
paddlevlp/models/groundingdino/backbone/__init__.py create mode 100644 paddlevlp/models/groundingdino/backbone/backbone.py create mode 100644 paddlevlp/models/groundingdino/backbone/position_encoding.py create mode 100644 paddlevlp/models/groundingdino/backbone/swin_transformer.py create mode 100644 paddlevlp/models/groundingdino/bert_model.py create mode 100644 paddlevlp/models/groundingdino/bertwarper.py create mode 100644 paddlevlp/models/groundingdino/configuration.py create mode 100644 paddlevlp/models/groundingdino/csrc/README.md create mode 100644 paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc create mode 100644 paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu create mode 100644 paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py create mode 100644 paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py create mode 100644 paddlevlp/models/groundingdino/fuse_modules.py create mode 100644 paddlevlp/models/groundingdino/layers.py create mode 100644 paddlevlp/models/groundingdino/modeling.py create mode 100644 paddlevlp/models/groundingdino/ms_deform_attn.py create mode 100644 paddlevlp/models/groundingdino/transformer.py create mode 100644 paddlevlp/models/groundingdino/transformer_vanilla.py create mode 100644 paddlevlp/models/groundingdino/utils.py create mode 100644 paddlevlp/processors/groundingdino_processing.py diff --git a/paddlevlp/examples/groundingdino/README.md b/paddlevlp/examples/groundingdino/README.md new file mode 100644 index 00000000000000..d2a004578e15a7 --- /dev/null +++ b/paddlevlp/examples/groundingdino/README.md @@ -0,0 +1,25 @@ +# Grounding DINO + +## 1. 模型简介 + +Paddle implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. + + +## 2. Demo + +## 2.1 prepare +```bash +#Multi-scale deformable attention custom OP compilation +cd /paddlevlp/models/groundingdino/csrc/ +python setup_ms_deformable_attn_op.py install + +``` +## 2.2 dynamic inference +```bash +python3.8 run_predict.py -dt groundingdino-swint-ogc +-i image_you_want_to_detect.jpg \ +-o "dir you want to save the output" \ +-t "Detect Cat" +``` + + diff --git a/paddlevlp/examples/groundingdino/__init__.py b/paddlevlp/examples/groundingdino/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/examples/groundingdino/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
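For readers who prefer calling the API directly, the following is a condensed sketch of the programmatic flow that the `run_predict.py` script below implements; the class and checkpoint names come from this patch, while the image path, text prompt, and threshold are placeholders:

```python
import paddle
import paddle.nn.functional as F
from PIL import Image

from paddlevlp.models.groundingdino.modeling import GroundingDinoModel
from paddlevlp.processors.groundingdino_processing import GroudingDinoProcessor

# The processor wraps a BERT tokenizer plus the Grounding DINO image transforms.
processor = GroudingDinoProcessor.from_pretrained("bert-base-uncased")
model = GroundingDinoModel.from_pretrained("groundingdino-swint-ogc")

image = Image.open("cat.jpg").convert("RGB")  # placeholder image path
image_tensor, mask, tokenized = processor(images=image, text="Detect Cat")

with paddle.no_grad():
    outputs = model(
        image_tensor,
        mask,
        input_ids=tokenized["input_ids"],
        attention_mask=tokenized["attention_mask"],
        text_self_attention_masks=tokenized["text_self_attention_masks"],
        position_ids=tokenized["position_ids"],
    )

# Keep the queries whose strongest token logit clears the box threshold.
logits = F.sigmoid(outputs["pred_logits"])[0]  # (num_queries, 256)
boxes = outputs["pred_boxes"][0]               # (num_queries, 4), cxcywh in [0, 1]
keep = logits.max(axis=1) > 0.3
print(boxes[keep])
```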
diff --git a/paddlevlp/examples/groundingdino/run_predict.py b/paddlevlp/examples/groundingdino/run_predict.py new file mode 100644 index 00000000000000..f461caac41cf3d --- /dev/null +++ b/paddlevlp/examples/groundingdino/run_predict.py @@ -0,0 +1,124 @@ +import argparse +import os +import numpy as np +import paddle +import paddle.nn.functional as F + +from paddlevlp.processors.groundingdino_processing import GroudingDinoProcessor +from paddlevlp.models.groundingdino.modeling import GroundingDinoModel +from PIL import Image, ImageDraw, ImageFont + + +def plot_boxes_to_image(image_pil, tgt): + H, W = tgt["size"] + boxes = tgt["boxes"] + labels = tgt["labels"] + assert len(boxes) == len(labels), "boxes and labels must have same length" + + draw = ImageDraw.Draw(image_pil) + mask = Image.new("L", image_pil.size, 0) + mask_draw = ImageDraw.Draw(mask) + + # draw boxes and masks + for box, label in zip(boxes, labels): + # from 0..1 to 0..W, 0..H + box = box * paddle.to_tensor([W, H, W, H]) + # from xywh to xyxy + box[:2] -= box[2:] / 2 + box[2:] += box[:2] + # random color + color = tuple(np.random.randint(0, 255, size=3).tolist()) + # draw + x0, y0, x1, y1 = box.numpy() + x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) + + draw.rectangle([x0, y0, x1, y1], outline=color, width=6) + # draw.text((x0, y0), str(label), fill=color) + + font = ImageFont.load_default() + if hasattr(font, "getbbox"): + bbox = draw.textbbox((x0, y0), str(label), font) + else: + w, h = draw.textsize(str(label), font) + bbox = (x0, y0, w + x0, y0 + h) + # bbox = draw.textbbox((x0, y0), str(label)) + draw.rectangle(bbox, fill=color) + draw.text((x0, y0), str(label), fill="white") + + mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6) + + return image_pil, mask + +def main(): + parser = argparse.ArgumentParser("Grounding DINO example", add_help=True) + parser.add_argument("--dino_type", "-dt", type=str, default="groundingdino-swint-ogc", help="dino type") + parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file") + parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt") + parser.add_argument( + "--output_dir", "-o", type=str, default="outputs", help="output directory" + ) + + parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") + parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") + parser.add_argument( + "--visual", + type=eval, + default=True, + ) + + + args = parser.parse_args() + + + #bulid processor + processor = GroudingDinoProcessor.from_pretrained( + 'bert-base-uncased' + ) + #bulid model + print(f'dino_model {args.dino_type}') + dino_model = GroundingDinoModel.from_pretrained(args.dino_type) + + #read image + image_pil = Image.open(args.image_path).convert("RGB") + #preprocess image text_prompt + image_tensor,mask,tokenized_out = processor(images=image_pil,text=args.text_prompt) + + with paddle.no_grad(): + outputs = dino_model(image_tensor,mask, input_ids=tokenized_out['input_ids'], + attention_mask=tokenized_out['attention_mask'],text_self_attention_masks=tokenized_out['text_self_attention_masks'], + position_ids=tokenized_out['position_ids']) + + logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) + boxes = outputs["pred_boxes"][0] # (nq, 4) + + # filter output + logits_filt = logits.clone() + boxes_filt = boxes.clone() + filt_mask = logits_filt.max(axis=1) > args.box_threshold + logits_filt = logits_filt[filt_mask] # num_filt, 256 + boxes_filt = 
boxes_filt[filt_mask] # num_filt, 4 + + # build pred + pred_phrases = [] + for logit, box in zip(logits_filt, boxes_filt): + pred_phrase = processor.decode(logit > args.text_threshold) + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") + + + size = image_pil.size + pred_dict = { + "boxes": boxes_filt, + "size": [size[1], size[0]], # H,W + "labels": pred_phrases, + } + print("output:",pred_dict) + + if args.visual: + # make dir + os.makedirs(args.output_dir, exist_ok=True) + image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0] + image_with_box.save(os.path.join(args.output_dir, "pred.jpg")) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/__init__.py b/paddlevlp/models/groundingdino/__init__.py new file mode 100644 index 00000000000000..d1ff79f33aafb8 --- /dev/null +++ b/paddlevlp/models/groundingdino/__init__.py @@ -0,0 +1,14 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + diff --git a/paddlevlp/models/groundingdino/backbone/__init__.py b/paddlevlp/models/groundingdino/backbone/__init__.py new file mode 100644 index 00000000000000..76e4b272b479a2 --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/__init__.py @@ -0,0 +1 @@ +from .backbone import build_backbone diff --git a/paddlevlp/models/groundingdino/backbone/backbone.py b/paddlevlp/models/groundingdino/backbone/backbone.py new file mode 100644 index 00000000000000..397a1fc36b234f --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/backbone.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Backbone modules. 
+""" + +from typing import Dict, List, Optional +from collections import OrderedDict + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +from .position_encoding import build_position_encoding +from .swin_transformer import SwinTransformerModel + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, x:paddle.Tensor,m:paddle.take): + xs,masks = self[0](x,m) + pos = [] + for mask in masks: + pos.append(self[1](mask).astype(x.dtype)) + return xs, masks,pos + + +def build_backbone(args): + """ + Useful args: + - backbone: backbone name + - lr_backbone: + - dilation + - return_interm_indices: available: [0,1,2,3], [1,2,3], [3] + - backbone_freeze_keywords: + - use_checkpoint: for swin only for now + + """ + position_embedding = build_position_encoding(args) + train_backbone = True + if not train_backbone: + raise ValueError("Please set lr_backbone > 0") + return_interm_indices = args.return_interm_indices + assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] + args.backbone_freeze_keywords + use_checkpoint = getattr(args, "use_checkpoint", False) + + if args.backbone in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ]: + pretrain_img_size = int(args.backbone.split("_")[-2]) + backbone = SwinTransformerModel.from_pretrained( + args.backbone, + pretrain_img_size=pretrain_img_size, + out_indices=tuple(return_interm_indices), + dilation=False, + use_checkpoint=use_checkpoint, + ) + + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] + else: + raise NotImplementedError("Unknown backbone {}".format(args.backbone)) + + assert len(bb_num_channels) == len( + return_interm_indices + ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}" + + model = Joiner(backbone, position_embedding) + model.num_channels = bb_num_channels + assert isinstance( + bb_num_channels, List + ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels)) + + return model diff --git a/paddlevlp/models/groundingdino/backbone/position_encoding.py b/paddlevlp/models/groundingdino/backbone/position_encoding.py new file mode 100644 index 00000000000000..821b0fcc161a6b --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/position_encoding.py @@ -0,0 +1,182 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Various positional encodings for the transformer. 
+""" +import math +from matplotlib.pyplot import axis + +import paddle +import paddle.nn as nn +from paddlenlp.utils.initializer import uniform_ + + + +class PositionEmbeddingSine(nn.Layer): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask:paddle.Tensor): + + assert mask is not None + not_mask = ~mask + y_embed = not_mask.astype(paddle.float32).cumsum(1) + x_embed = not_mask.astype(paddle.float32).cumsum(2) + if self.normalize: + eps = 1e-6 + # if os.environ.get("SHILONG_AMP", None) == '1': + # eps = 1e-4 + # else: + # eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype(paddle.float32x) + dim_t = self.temperature ** (dim_t / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = paddle.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos_y = paddle.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) + return pos + + +class PositionEmbeddingSineHW(nn.Layer): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperatureH = temperatureH + self.temperatureW = temperatureW + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask:paddle.Tensor): + + assert mask is not None + not_mask = ~mask + y_embed = not_mask.astype(paddle.float32).cumsum(1) + x_embed = not_mask.astype(paddle.float32).cumsum(2) + + # import ipdb; ipdb.set_trace() + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_tx = paddle.arange(self.num_pos_feats) + dim_tx = self.temperatureW ** (2 * (paddle.floor_divide(dim_tx, paddle.to_tensor(2))) / self.num_pos_feats) + pos_x = x_embed[:, :, :, None] / dim_tx + + dim_ty = paddle.arange(self.num_pos_feats) + dim_ty = self.temperatureH ** (2 * (paddle.floor_divide(dim_ty, paddle.to_tensor(2))) / self.num_pos_feats) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = paddle.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos_y = paddle.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) + + # import ipdb; ipdb.set_trace() + + return pos + + +class PositionEmbeddingLearned(nn.Layer): + """ + Absolute pos embedding, learned. + """ + + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + uniform_(self.row_embed.weight) + uniform_(self.col_embed.weight) + + def forward(self, x: paddle.Tensor): + + h, w = x.shape[-2:] + i = paddle.arange(w) + j = paddle.arange(h) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = ( + paddle.concat( + [ + x_emb.unsqueeze(0).tile([h, 1, 1]), + y_emb.unsqueeze(1).tile([1, w, 1]), + ], + axis=-1, + ) + .transpose([2, 0, 1]) + .unsqueeze(0) + .tile([x.shape[0], 1, 1, 1]) + ) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ("v2", "sine"): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSineHW( + N_steps, + temperatureH=args.pe_temperatureH, + temperatureW=args.pe_temperatureW, + normalize=True, + ) + elif args.position_embedding in ("v3", "learned"): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/paddlevlp/models/groundingdino/backbone/swin_transformer.py b/paddlevlp/models/groundingdino/backbone/swin_transformer.py new file mode 100644 index 00000000000000..cd636f1b7965d6 --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/swin_transformer.py @@ -0,0 +1,897 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn.initializer import Constant +from ..layers import DropPath, to_2tuple +trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) + +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +""" swin_transformer model configuration""" +__all__ = ["SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION", "SwinTransformerConfig", "SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP"] + + +SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION = { + "swin_T_224_1k": { + "in_chans": 3, + "embed_dim": 96, + "depths": [2, 2, 6, 2], + "num_heads": [3, 6, 12, 24], + "window_size": 7, + "pretrain_img_size": 224, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False, + + + }, + "swin_B_224_22k": { + "in_chans": 3, + "embed_dim": 128, + "depths": [2, 2, 18, 2], + "num_heads": [4, 8, 16, 32], + "window_size": 7, + "pretrain_img_size": 224, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False + }, + "swin_B_384_22k": { + "in_chans": 3, + "embed_dim": 128, + "depths": [2, 2, 18, 2], + "num_heads": [4, 8, 16, 32], + "window_size": 12, + "pretrain_img_size": 384, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint":False + }, + "swin_L_224_22k": { + "in_chans": 3, + "embed_dim": 192, + "depths": [2, 2, 18, 2], + "num_heads": [6, 12, 24, 48], + "window_size": 7, + "pretrain_img_size": 224, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False + }, + "swin_L_384_22k":{ + "in_chans": 3, + "embed_dim": 192, + "depths": [2, 2, 18, 2], + "num_heads": [6, 12, 24, 48], + "window_size": 12, + "pretrain_img_size": 384, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False + }, + +} + 
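+# Illustrative usage sketch: the entries above are assumed to mirror the keyword arguments of
+# `SwinTransformerConfig` (defined below), so a named variant can be built directly from its
+# configuration dict, e.g.
+#
+#     cfg = SwinTransformerConfig(**SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION["swin_T_224_1k"])
+#     backbone = SwinTransformerModel(cfg)
+#
+# `SwinTransformerModel.from_pretrained("swin_T_224_1k")` is expected to build the same config and
+# additionally fetch the weights listed in SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP below.
+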
+SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP = {
+    "model_state": {
+        "swin_T_224_1k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams",
+        "swin_B_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams",
+        "swin_B_384_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams",
+        "swin_L_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams",
+        "swin_L_384_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams",
+    }
+}
+
+
+class SwinTransformerConfig(PretrainedConfig):
+
+    model_type = "swintransformer"
+    pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION
+
+    def __init__(
+        self,
+        in_chans=3,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        pretrain_img_size=224,
+        patch_size=4,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.2,
+        norm_layer=nn.LayerNorm,
+        ape=False,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        dilation=False,
+        use_checkpoint=False
+    ):
+        super().__init__()
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.pretrain_img_size = pretrain_img_size
+        self.patch_size = patch_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.norm_layer = norm_layer
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.dilation = dilation
+        self.use_checkpoint = use_checkpoint
+
+class SwinTransformerPretrainedModel(PretrainedModel):
+    """
+    See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details.
+ """ + + model_config_file = "config.json" + config_class = SwinTransformerConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "swintransformer" + + pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP + + +class Mlp(nn.Layer): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape([B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape([B, H // window_size, W // window_size, window_size, window_size, -1]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) + return x + + +class WindowAttention(nn.Layer): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads], + dtype=paddle.float32, + default_initializer=Constant(0.) 
+ ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape([B_, N, 3, self.num_heads, C // self.num_heads]) + .transpose([2, 0, 3, 1, 4]) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + index = self.relative_position_index.flatten() + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([-1, nW, self.num_heads, N, N]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Layer): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + pad_list = paddle.zeros([4],dtype="int32") + pad_list[1] = pad_r + pad_list[3] = pad_b + x = F.pad(x,pad_list,data_format='NHWC') + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll(x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, C] + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll(shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :] + + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Layer): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.reshape([B, H, W, C]) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + pad_list = paddle.zeros([4],dtype="int32") + pad_list[1] = H % 2 + pad_list[3] = W % 2 + x = F.pad(x, pad_list) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Layer): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = paddle.ceil(paddle.to_tensor(H / self.window_size)).astype("int32") * self.window_size + Wp = paddle.ceil(paddle.to_tensor(W / self.window_size)).astype("int32") * self.window_size + img_mask = paddle.zeros((1,Hp,Wp,1), dtype=paddle.float32) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = -100.0 * paddle.ones_like(attn_mask) * (attn_mask != 0).astype(paddle.float32) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = recompute(blk, x, attn_mask, **{"preserve_rng_state": True}) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Layer): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.shape + if W % self.patch_size[1] != 0: + pad_list = paddle.zeros([4],dtype="int32") + pad_list[1] = self.patch_size[1] - W % self.patch_size[1] + x = F.pad(x, pad_list) + if H % self.patch_size[0] != 0: + pad_list = paddle.zeros([4],dtype="int32") + pad_list[3] = self.patch_size[0] - H % self.patch_size[0] + x = F.pad(x, pad_list) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.shape[2:] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) + + return x + +@register_base_model +class SwinTransformerModel(SwinTransformerPretrainedModel): + """Swin Transformer backbone. 
+    A Paddle implementation of `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
+        https://arxiv.org/pdf/2103.14030
+    """
+
+    def __init__(self, config: SwinTransformerConfig):
+        super(SwinTransformerModel, self).__init__(config)
+
+        self.pretrain_img_size = config.pretrain_img_size
+        self.num_layers = len(config.depths)
+        self.in_chans = config.in_chans
+        self.embed_dim = config.embed_dim
+        self.ape = config.ape
+        self.patch_norm = config.patch_norm
+        self.norm_layer = nn.LayerNorm
+        self.out_indices = config.out_indices
+        self.frozen_stages = config.frozen_stages
+        self.dilation = config.dilation
+        self.patch_size = config.patch_size
+        self.drop_path_rate = config.drop_path_rate
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=self.patch_size,
+            in_chans=self.in_chans,
+            embed_dim=self.embed_dim,
+            norm_layer=self.norm_layer if self.patch_norm else None,
+        )
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(self.pretrain_img_size)
+            patch_size = to_2tuple(self.patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1],
+            ]
+
+            self.absolute_pos_embed = self.create_parameter(
+                shape=[1, self.embed_dim, patches_resolution[0], patches_resolution[1]],
+                dtype=paddle.float32,
+                default_initializer=Constant(0.)
+            )
+            trunc_normal_(self.absolute_pos_embed)
+
+        self.pos_drop = nn.Dropout(p=config.drop_rate)
+
+        # stochastic depth
+        dpr = [
+            x.item() for x in paddle.linspace(0, config.drop_path_rate, sum(config.depths))
+        ]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.LayerList()
+        # prepare downsample list
+        downsamplelist = [PatchMerging for i in range(self.num_layers)]
+        downsamplelist[-1] = None
+        num_features = [int(self.embed_dim * 2**i) for i in range(self.num_layers)]
+        if self.dilation:
+            downsamplelist[-2] = None
+            num_features[-1] = int(self.embed_dim * 2 ** (self.num_layers - 1)) // 2
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=num_features[i_layer],
+                depth=config.depths[i_layer],
+                num_heads=config.num_heads[i_layer],
+                window_size=config.window_size,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                qk_scale=config.qk_scale,
+                drop=config.drop_rate,
+                attn_drop=config.attn_drop_rate,
+                drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+                norm_layer=self.norm_layer,
+                downsample=downsamplelist[i_layer],
+                use_checkpoint=config.use_checkpoint,
+            )
+            self.layers.append(layer)
+
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in self.out_indices:
+            layer = self.norm_layer(num_features[i_layer])
+            layer_name = f"norm{i_layer}"
+            self.add_sublayer(layer_name, layer)
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.stop_gradient = True
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.stop_gradient = True
+
+    def forward_raw(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+
+        Wh, Ww = x.shape[2:4]
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
+            )
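+            # Resizing the learned embedding this way lets inputs run at resolutions other than
+            # the pretraining grid of pretrain_img_size // patch_size positions per side.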
+ x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C + else: + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # outs: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + return tuple(outs) + + def forward_with_mask(self, x:paddle.Tensor, m:paddle.Tensor): + + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.shape[2], x.shape[3] + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C + else: + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) + outs.append(out) + + feat_dict = [] + mask_dict = [] + for idx, out_i in enumerate(outs): + assert m is not None + mask = F.interpolate(m[None].cast(paddle.float32), size=out_i.shape[-2:]).cast(paddle.bool)[0] + feat_dict.append(out_i) + mask_dict.append(mask) + + return feat_dict,mask_dict + + def forward(self, x:paddle.Tensor, m=None): + if m is not None: + return self.forward_with_mask(x,m) + else: + return self.forward_raw(x) + + diff --git a/paddlevlp/models/groundingdino/bert_model.py b/paddlevlp/models/groundingdino/bert_model.py new file mode 100644 index 00000000000000..e0cbf877fba3a9 --- /dev/null +++ b/paddlevlp/models/groundingdino/bert_model.py @@ -0,0 +1,715 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlenlp.taskflow.utils import pad_batch_data +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import math +import numpy as np +import warnings +from paddlenlp.transformers.bert.modeling import BaseModelOutputWithPoolingAndCrossAttentions + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + self.act = nn.functional.gelu + + def forward(self, input): + return self.act(input) + + +class BertSelfAttention(nn.Layer): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # 'absolute' + self.clamp_min_for_underflow = clamp_min_for_underflow + self.clamp_max_for_overflow = clamp_max_for_overflow + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = tuple(x.shape[:-1]) + (self.num_attention_heads, self.attention_head_size) + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: # here + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + # return query_layer,key_layer + if self.is_decoder: # False + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + # return attention_scores + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if self.clamp_min_for_underflow: + attention_scores = paddle.clip(attention_scores, min=-50000) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attention_scores = paddle.clip(attention_scores, max=50000) # Do not increase 50000, data type half has quite limited range + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # if math.isnan(attention_probs.sum().item()): + # for i in range(attention_probs.size(1)): + # for j in range(attention_probs.size(2)): + # if math.isnan(attention_probs[0, i, j].sum().item()): + # print(i, j) + # pdb.set_trace() + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = tuple(context_layer.shape[:-2]) + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) # diff 7.2274e-06 + hidden_states = self.dropout(hidden_states) # diff 4.22e-05 + # hidden_states + input_tensor diff : 7.22e-6 + hidden_states = self.LayerNorm(hidden_states + input_tensor) #diff 1.087e-05 + return hidden_states + + +class BertAttention(nn.Layer): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): + super().__init__() + self.self = BertSelfAttention(config, clamp_min_for_underflow, clamp_max_for_overflow) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) #pass + # return self_outputs + attention_output = self.output(self_outputs[0], hidden_states) + # print(attention_output.shape, self_outputs[0].shape, len(self_outputs)) + # attention_output 1.087e-05, self_outputs 1.31e-06 , hidden_states 1.33e-08 + # return attention_output, self_outputs, hidden_states + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = GELUActivation() + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + 
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", paddle.arange(config.max_position_embeddings).reshape((1, -1))) + self.register_buffer( + "token_type_ids", paddle.zeros(self.position_ids.shape, dtype=paddle.int64), persistable=False + ) + + def forward( + self, + input_ids = None, + token_type_ids = None, + position_ids = None, + inputs_embeds = None, + past_key_values_length = 0, + ): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([input_shape[0], seq_length]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + # return inputs_embeds + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + +class BertLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask = None, + head_mask = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_value = None, + output_attentions = False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) 
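+        # self_attention_outputs layout: index 0 holds the attended hidden states, the attention
+        # probabilities follow when output_attentions=True, and the decoder key/value cache (if any)
+        # is appended as the last element.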
+ # return self_attention_outputs + attention_output = self_attention_outputs[0] + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = self.feed_forward_chunk(attention_output) + # return layer_output, attention_output + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + +class BertEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask = None, + head_mask = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_values = None, + use_cache = None, + output_attentions = False, + output_hidden_states = False, + return_dict = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + # return layer_outputs + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + 
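+        # Return either a plain tuple (None entries dropped) or a
+        # BaseModelOutputWithPoolingAndCrossAttentions for attribute-style access.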
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            pooler_output=None,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class BertPooler(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class BertModel(nn.Layer):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder, the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__()
+        self.config = config
+
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        # self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self, attention_mask: paddle.Tensor, input_shape: Tuple[int], device: str = None, dtype=None
+    ) -> paddle.Tensor:
+        if dtype is None:
+            dtype = np.float32
+
+        if not (attention_mask.dim() == 2 and self.config.is_decoder):
+            # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
+            if device is not None:
+                warnings.warn(
+                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+                )
+            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+            # ourselves in which case we just need to make it broadcastable to all heads.
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * np.finfo(dtype).min + return extended_attention_mask + + def get_head_mask( + self, head_mask, num_hidden_layers, is_attention_chunked = False + ): + head_mask = [None] * num_hidden_layers + return head_mask + + def forward( + self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_values = None, + use_cache = None, + output_attentions = None, + output_hidden_states = None, + return_dict = None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([batch_size, seq_length]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + # return embedding_output + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # return encoder_outputs + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if 
not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + +class language_model(nn.Layer): + def __init__(self, cfg, bert_config): + super().__init__() + self.cfg = cfg + self.bert_name = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE + print("LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) + bert_config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT + + # bert_config.attention_probs_dropout_prob = 0.0 + # bert_config.hidden_dropout_prob = 0.0 + + self.model = BertModel(bert_config) + self.language_dim = 768 + self.num_layers = cfg.MODEL.LANGUAGE_BACKBONE.N_LAYERS + + def forward(self, x): + input = x["input_ids"] + mask = x["attention_mask"] + + if self.cfg.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS: # true + # with padding, always 256 + outputs = self.model( + input_ids=input, + attention_mask=mask, + output_hidden_states=True, + ) + # outputs has 13 layers, 1 input layer and 12 hidden layers + encoded_layers = outputs.hidden_states[1:] + features = None + features = paddle.stack(encoded_layers[-self.num_layers:], 1).mean(1) + + # language embedding has shape [len(phrase), seq_len, language_dim] + features = features / self.num_layers + + embedded = paddle.cast(features * mask.unsqueeze(-1), paddle.float32) + aggregate = embedded.sum(1) / (paddle.cast(mask.sum(-1).unsqueeze(-1),paddle.float32)) + + ret = { + "aggregate": aggregate, + "embedded": embedded, + "masks": mask, + "hidden": encoded_layers[-1] + } + return ret \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/bertwarper.py b/paddlevlp/models/groundingdino/bertwarper.py new file mode 100644 index 00000000000000..d4c75bccdbe339 --- /dev/null +++ b/paddlevlp/models/groundingdino/bertwarper.py @@ -0,0 +1,277 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .bert_model import BertModel +from paddlenlp.transformers.model_outputs import BaseModelOutputWithPoolingAndCrossAttentions + + +class BertModelWarper(nn.Layer): + def __init__(self, bert_model): + super().__init__() + bert_model = BertModel(bert_model.config) + + self.config = bert_model.config + self.embeddings = bert_model.embeddings + self.encoder = bert_model.encoder + self.pooler = bert_model.pooler + + self.get_extended_attention_mask = bert_model.get_extended_attention_mask + # self.invert_attention_mask = bert_model.invert_attention_mask + self.get_head_mask = bert_model.get_head_mask + self.use_return_dict = True + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = paddle.ones( + ((batch_size, seq_length + past_key_values_length)) + ) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + # if self.config.is_decoder and encoder_hidden_states is not None: + # encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + # encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + # if encoder_attention_mask is None: + # encoder_attention_mask = paddle.ones(encoder_hidden_shape, device=device) + # encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + # else: + # encoder_extended_attention_mask = None + encoder_extended_attention_mask = None + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + 
return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class TextEncoderShell(nn.Layer): + def __init__(self, text_encoder): + super().__init__() + self.text_encoder = text_encoder + self.config = self.text_encoder.config + + def forward(self, **kw): + # feed into text encoder + return self.text_encoder(**kw) + + +def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool) + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = paddle.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + paddle.eye(num_token, dtype=paddle.bool).unsqueeze(0).tile([bs, 1, 1]) + ) + position_ids = paddle.zeros((bs, num_token)) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + + previous_col = col + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.cast(paddle.int64) + + +def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool) + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = paddle.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) + ) + position_ids = paddle.zeros((bs, num_token)) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros([num_token,]).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + # cate_to_token_mask_list = [ + # paddle.stack(cate_to_token_mask_listi, axis=0) + # for cate_to_token_mask_listi in cate_to_token_mask_list + # ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list diff --git a/paddlevlp/models/groundingdino/configuration.py b/paddlevlp/models/groundingdino/configuration.py new file mode 100644 index 00000000000000..d39c42461b99d0 --- /dev/null +++ b/paddlevlp/models/groundingdino/configuration.py @@ -0,0 +1,168 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
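+
+# Example (illustrative sketch, not part of the public API): build the released
+# "groundingdino-swint-ogc" configuration and override individual fields before
+# constructing the model:
+#
+#     config = GroundingDinoConfig(**GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION["groundingdino-swint-ogc"])
+#     config.use_checkpoint = True  # e.g. turn on checkpointing to save memory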
+ +""" GroundingDino model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION", "GroundingDinoConfig", "GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP"] + +GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION = { + "groundingdino-swint-ogc": { + "modelname" : "groundingdino", + "backbone" : "swin_T_224_1k", + "position_embedding" : "sine", + "pe_temperatureH" : 20, + "pe_temperatureW" : 20, + "return_interm_indices" : [1, 2, 3], + "backbone_freeze_keywords" : None, + "enc_layers" : 6, + "dec_layers" : 6, + "pre_norm" : False, + "dim_feedforward" : 2048, + "hidden_dim" : 256, + "dropout" : 0.0, + "nheads" : 8, + "num_queries" : 900, + "query_dim" : 4, + "num_patterns" : 0, + "num_feature_levels" : 4, + "enc_n_points" : 4, + "dec_n_points" : 4, + "two_stage_type" : "standard", + "two_stage_bbox_embed_share" : False, + "two_stage_class_embed_share" : False, + "transformer_activation" : "relu", + "dec_pred_bbox_embed_share" : True, + "dn_box_noise_scale" : 1.0, + "dn_label_noise_ratio" : 0.5, + "dn_label_coef" : 1.0, + "dn_bbox_coef" : 1.0, + "embed_init_tgt" :True, + "dn_labelbook_size" : 2000, + "max_text_len" : 256, + "text_encoder_type" : "bert-base-uncased", + "use_text_enhancer" : True, + "use_fusion_layer" : True, + "use_checkpoint" : False, + "use_transformer_ckpt" : False, + "use_text_cross_attention" : True, + "text_dropout" : 0.0, + "fusion_dropout" : 0.0, + "fusion_droppath" : 0.1, + "sub_sentence_present" : True + }, +} + +GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "groundingdino-swint-ogc": "https://bj.bcebos.com/v1/paddledet/models/groundingdino_swint_ogc.pdparams", + } +} + + +class GroundingDinoConfig(PretrainedConfig): + + model_type = "groundingdino" + pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + modelname = "groundingdino", + backbone = "swin_T_224_1k", + position_embedding = "sine", + pe_temperatureH = 20, + pe_temperatureW = 20, + return_interm_indices = [1, 2, 3], + backbone_freeze_keywords = None, + enc_layers = 6, + dec_layers = 6, + pre_norm = False, + dim_feedforward = 2048, + hidden_dim = 256, + dropout = 0.0, + nheads = 8, + num_queries = 900, + query_dim = 4, + num_patterns = 0, + num_feature_levels = 4, + enc_n_points = 4, + dec_n_points = 4, + two_stage_type = "standard", + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, + transformer_activation = "relu", + dec_pred_bbox_embed_share = True, + dn_box_noise_scale = 1.0, + dn_label_noise_ratio = 0.5, + dn_label_coef = 1.0, + dn_bbox_coef = 1.0, + embed_init_tgt = True, + dn_labelbook_size = 2000, + max_text_len = 256, + text_encoder_type = "bert-base-uncased", + use_text_enhancer = True, + use_fusion_layer = True, + use_checkpoint = False, + use_transformer_ckpt = False, + use_text_cross_attention = True, + text_dropout = 0.0, + fusion_dropout = 0.0, + fusion_droppath = 0.1, + sub_sentence_present = True + ): + super().__init__() + self.modelname = modelname + self.backbone = backbone + self.position_embedding = position_embedding + self.pe_temperatureH = pe_temperatureH + self.pe_temperatureW = pe_temperatureW + self.return_interm_indices = return_interm_indices + self.backbone_freeze_keywords = backbone_freeze_keywords + self.enc_layers = enc_layers + self.dec_layers = dec_layers + self.pre_norm = pre_norm + self.dim_feedforward = dim_feedforward + self.hidden_dim = hidden_dim + self.dropout = dropout + self.nheads = 
nheads
+        self.num_queries = num_queries
+        self.query_dim = query_dim
+        self.num_patterns = num_patterns
+        self.num_feature_levels = num_feature_levels
+        self.enc_n_points = enc_n_points
+        self.dec_n_points = dec_n_points
+        self.two_stage_type = two_stage_type
+        self.two_stage_bbox_embed_share = two_stage_bbox_embed_share
+        self.two_stage_class_embed_share = two_stage_class_embed_share
+        self.transformer_activation = transformer_activation
+        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
+        self.dn_box_noise_scale = dn_box_noise_scale
+        self.dn_label_noise_ratio = dn_label_noise_ratio
+        self.dn_label_coef = dn_label_coef
+        self.dn_bbox_coef = dn_bbox_coef
+        self.embed_init_tgt = embed_init_tgt
+        self.dn_labelbook_size = dn_labelbook_size
+        self.max_text_len = max_text_len
+        self.text_encoder_type = text_encoder_type
+        self.use_text_enhancer = use_text_enhancer
+        self.use_fusion_layer = use_fusion_layer
+        self.use_checkpoint = use_checkpoint
+        self.use_transformer_ckpt = use_transformer_ckpt
+        self.use_text_cross_attention = use_text_cross_attention
+        self.text_dropout = text_dropout
+        self.fusion_dropout = fusion_dropout
+        self.fusion_droppath = fusion_droppath
+        self.sub_sentence_present = sub_sentence_present
diff --git a/paddlevlp/models/groundingdino/csrc/README.md b/paddlevlp/models/groundingdino/csrc/README.md
new file mode 100644
index 00000000000000..290926d56a3ae2
--- /dev/null
+++ b/paddlevlp/models/groundingdino/csrc/README.md
@@ -0,0 +1,85 @@
+# Building the multi-scale deformable attention custom op
+This custom op is implemented following the PaddlePaddle guide on [custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).
+
+## 1. Requirements
+- Paddle >= 2.3.2
+- gcc 8.2
+
+## 2. Installation
+Build and install the op from the current directory:
+```
+cd paddlevlp/models/groundingdino/csrc/
+python setup_ms_deformable_attn_op.py install
+```
+
+After compilation the op is ready to use. A usage example of `ms_deformable_attn`:
+```
+import paddle
+
+# import the custom op
+from deformable_detr_ops import ms_deformable_attn
+
+# build fake input tensors
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+    return [value, sampling_locations, attention_weights]
+
+value, sampling_locations, attention_weights = get_test_tensors(c)
+
+output = ms_deformable_attn(value,
+                            spatial_shapes,
+                            level_start_index,
+                            sampling_locations,
+                            attention_weights)
+```
+
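+As a quick sanity check (a sketch that assumes the op compiled successfully and a GPU is available), the returned tensor should have shape `[bs, query_length, n_heads * c]`, which matches the op's registered InferShape rule:
+```
+assert output.shape == [bs, query_length, n_heads * c]  # [2, 2, 64] for the fake inputs above
+```
+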
+## 3. Unit test
+Run the unit test to verify that the custom op works correctly:
+```
+python test_ms_deformable_attn_op.py
+```
+On success, the following is printed:
+```
+*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
+*tensor1 True check_gradient_numerical(D=30)
+*tensor2 True check_gradient_numerical(D=30)
+*tensor3 True check_gradient_numerical(D=30)
+*tensor1 True check_gradient_numerical(D=32)
+*tensor2 True check_gradient_numerical(D=32)
+*tensor3 True check_gradient_numerical(D=32)
+*tensor1 True check_gradient_numerical(D=64)
+*tensor2 True check_gradient_numerical(D=64)
+*tensor3 True check_gradient_numerical(D=64)
+*tensor1 True check_gradient_numerical(D=71)
+*tensor2 True check_gradient_numerical(D=71)
+*tensor3 True check_gradient_numerical(D=71)
+*tensor1 True check_gradient_numerical(D=128)
+*tensor2 True check_gradient_numerical(D=128)
+*tensor3 True check_gradient_numerical(D=128)
+*tensor1 True check_gradient_numerical(D=1024)
+*tensor2 True check_gradient_numerical(D=1024)
+*tensor3 True check_gradient_numerical(D=1024)
+*tensor1 True check_gradient_numerical(D=1025)
+*tensor2 True check_gradient_numerical(D=1025)
+*tensor3 True check_gradient_numerical(D=1025)
+*tensor1 True check_gradient_numerical(D=2048)
+*tensor2 True check_gradient_numerical(D=2048)
+*tensor3 True check_gradient_numerical(D=2048)
+*tensor1 True check_gradient_numerical(D=3096)
+*tensor2 True check_gradient_numerical(D=3096)
+*tensor3 True check_gradient_numerical(D=3096)
+```
diff --git a/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc
new file mode 100644
index 00000000000000..d1758adbcd9951
--- /dev/null
+++ b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/extension.h" + +#include + +// declare GPU implementation +std::vector +MSDeformableAttnCUDAForward(const paddle::Tensor &value, + const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights); + +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out); + +//// CPU not implemented + +std::vector> +MSDeformableAttnInferShape(std::vector value_shape, + std::vector value_spatial_shapes_shape, + std::vector value_level_start_index_shape, + std::vector sampling_locations_shape, + std::vector attention_weights_shape) { + return {{value_shape[0], sampling_locations_shape[1], + value_shape[2] * value_shape[3]}}; +} + +std::vector +MSDeformableAttnInferDtype(paddle::DataType value_dtype, + paddle::DataType value_spatial_shapes_dtype, + paddle::DataType value_level_start_index_dtype, + paddle::DataType sampling_locations_dtype, + paddle::DataType attention_weights_dtype) { + return {value_dtype}; +} + +PD_BUILD_OP(ms_deformable_attn) + .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations", + "AttentionWeights"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype)); + +PD_BUILD_GRAD_OP(ms_deformable_attn) + .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations", + "AttentionWeights", paddle::Grad("Out")}) + .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"), + paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"), + paddle::Grad("AttentionWeights")}) + .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward)); diff --git a/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu new file mode 100644 index 00000000000000..d5a8d16181bb53 --- /dev/null +++ b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu @@ -0,0 +1,1073 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/extension.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) { + return (N + num_threads - 1) / num_threads; +} + +// forward bilinear +template +__device__ data_t deformable_attn_bilinear_forward( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +// forward kernel +template +__global__ void deformable_attn_cuda_kernel_forward( + const int n, const data_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const data_t *data_sampling_loc, + const data_t *data_attn_weight, const int batch_size, + const int value_length, const int num_heads, const int channels, + const int num_levels, const int query_length, const int num_points, + data_t *output_data_ptr) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + data_t *data_ptr = output_data_ptr + index; + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + data_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset + + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = 
data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += deformable_attn_bilinear_forward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_ptr = col; + } +} + +#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +// forward +std::vector +MSDeformableAttnCUDAForward(const paddle::Tensor &value, + const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto output = paddle::full({batch_size, query_length, num_heads * channels}, + 0, value.dtype(), paddle::GPUPlace()); + + const int num_kernels = batch_size * query_length * num_heads * channels; + deformable_attn_cuda_kernel_forward + <<>>(num_kernels, value.data(), + value_spatial_shapes.data(), + value_level_start_index.data(), + sampling_locations.data(), + attention_weights.data(), batch_size, + value_length, num_heads, channels, num_levels, + query_length, num_points, output.data()); + return {output}; +} + +// backward bilinear +template +__device__ void deformable_attn_bilinear_backward( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * 
top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void deformable_attn_bilinear_backward_gm( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + +// backward kernels +// channels > 1024 +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t 
*data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; 
+ data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_gm( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// channels <= 1024 +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = 
_temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockSize; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= 
query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % 
query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const 
data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// backward branch +template +void deformable_attn_cuda_backward( + cudaStream_t stream, const data_t *grad_out, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int num_threads = + (channels > CUDA_NUM_THREADS) ? 
CUDA_NUM_THREADS : channels; + const int num_kernels = batch_size * query_length * num_heads * channels; + const int num_actual_kernels = + batch_size * query_length * num_heads * channels; + if (channels > 1024) { + if ((channels & 1023) == 0) { + deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_gm + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + } + } else { + switch (channels) { + case 1: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 2: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 4: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 8: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 16: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 32: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 64: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 128: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, 
data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 256: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 512: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 1024: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + default: + if (channels < 64) { + deformable_attn_cuda_kernel_backward_shm_reduce_v1 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_shm_reduce_v2 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } + } + } +} + +// backward +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + CHECK_INPUT_GPU(grad_out); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto grad_value = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_spatial_shapes = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_level_start_index = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_sampling_locations = + paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(), + paddle::GPUPlace()); + auto grad_attention_weights = + paddle::full(attention_weights.shape(), 0, attention_weights.dtype(), + paddle::GPUPlace()); + + deformable_attn_cuda_backward( + value.stream(), grad_out.data(), value.data(), + value_spatial_shapes.data(), + 
value_level_start_index.data(), sampling_locations.data(), + attention_weights.data(), batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, grad_value.data(), + grad_sampling_locations.data(), + grad_attention_weights.data()); + + return {grad_value, grad_spatial_shapes, grad_level_start_index, + grad_sampling_locations, grad_attention_weights}; +} diff --git a/paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py b/paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py new file mode 100644 index 00000000000000..7c3c386677e5d5 --- /dev/null +++ b/paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py @@ -0,0 +1,7 @@ +from paddle.utils.cpp_extension import CUDAExtension, setup + +if __name__ == "__main__": + setup( + name='deformable_detr_ops', + ext_modules=CUDAExtension( + sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu'])) diff --git a/paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py b/paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py new file mode 100644 index 00000000000000..94a05737cbcd6d --- /dev/null +++ b/paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
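+
+# A minimal, self-contained usage sketch for the custom op exercised below
+# (illustrative only; it assumes the extension was already built from the
+# sources listed in setup_ms_deformable_attn_op.py, e.g. with
+# `python setup_ms_deformable_attn_op.py install`):
+#
+#   import paddle
+#   from deformable_detr_ops import ms_deformable_attn
+#
+#   # value:              [bs, value_length, n_heads, c]
+#   # spatial_shapes:     [n_levels, 2] (int64), value_length == sum(H_l * W_l)
+#   # level_start_index:  [n_levels]   (int64)
+#   # sampling_locations: [bs, query_length, n_heads, n_levels, n_points, 2] in [0, 1]
+#   # attention_weights:  [bs, query_length, n_heads, n_levels, n_points]
+#   spatial_shapes = paddle.to_tensor([[6, 4], [3, 2]], dtype=paddle.int64)
+#   level_start_index = paddle.to_tensor([0, 24], dtype=paddle.int64)
+#   value = paddle.rand([2, 30, 8, 16])
+#   sampling_locations = paddle.rand([2, 2, 8, 2, 2, 2])
+#   attention_weights = paddle.rand([2, 2, 8, 2, 2])
+#   attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+#   out = ms_deformable_attn(value, spatial_shapes, level_start_index,
+#                            sampling_locations, attention_weights)  # [2, 2, 8 * 16]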
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import os +import sys +import random +import numpy as np +import paddle +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.modeling.transformers.utils import deformable_attention_core_func +ms_deform_attn_core_paddle = deformable_attention_core_func + +try: + gpu_index = int(sys.argv[1]) +except: + gpu_index = 0 +print(f'Use gpu {gpu_index} to test...') +paddle.set_device(f'gpu:{gpu_index}') + +try: + from deformable_detr_ops import ms_deformable_attn +except Exception as e: + print('import deformable_detr_ops error', e) + sys.exit(-1) + +paddle.seed(1) +random.seed(1) +np.random.seed(1) + +bs, n_heads, c = 2, 8, 8 +query_length, n_levels, n_points = 2, 2, 2 +spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) +level_start_index = paddle.concat((paddle.to_tensor( + [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) +value_length = sum([(H * W).item() for H, W in spatial_shapes]) + + +def get_test_tensors(channels): + value = paddle.rand( + [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 + sampling_locations = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points, 2], + dtype=paddle.float32) + attention_weights = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points], + dtype=paddle.float32) + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum( + -2, keepdim=True) + + return [value, sampling_locations, attention_weights] + + +@paddle.no_grad() +def check_forward_equal_with_paddle_float(): + value, sampling_locations, attention_weights = get_test_tensors(c) + + output_paddle = ms_deform_attn_core_paddle( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights).detach().cpu() + output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, + sampling_locations, + attention_weights).detach().cpu() + fwdok = paddle.allclose( + output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() + max_abs_err = (output_cuda - output_paddle).abs().max().item() + max_rel_err = ( + (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() + + print( + f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' + ) + + +def check_gradient_numerical(channels=4): + value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( + channels) + value_paddle.stop_gradient = False + sampling_locations_paddle.stop_gradient = False + attention_weights_paddle.stop_gradient = False + + value_cuda = value_paddle.detach().clone() + sampling_locations_cuda = sampling_locations_paddle.detach().clone() + attention_weights_cuda = attention_weights_paddle.detach().clone() + value_cuda.stop_gradient = False + sampling_locations_cuda.stop_gradient = False + attention_weights_cuda.stop_gradient = False + + output_paddle = ms_deform_attn_core_paddle( + value_paddle, spatial_shapes, level_start_index, + sampling_locations_paddle, attention_weights_paddle) + output_paddle.sum().backward() + + output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, + level_start_index, sampling_locations_cuda, + attention_weights_cuda) + output_cuda.sum().backward() + + res = paddle.allclose( + value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() + 
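+    # Note: the CUDA backward accumulates gradients in parallel (see the
+    # shared-memory reductions in ms_deformable_attn_op.cu), so bitwise
+    # equality with the pure-Paddle reference is not expected; the loose
+    # rtol/atol above only require the two backward paths to agree up to
+    # float32 accumulation-order noise.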
print(f'*tensor1 {res} check_gradient_numerical(D={channels})') + + res = paddle.allclose( + sampling_locations_paddle.grad, + sampling_locations_cuda.grad, + rtol=1e-2, + atol=1e-3).item() + print(f'*tensor2 {res} check_gradient_numerical(D={channels})') + + res = paddle.allclose( + attention_weights_paddle.grad, + attention_weights_cuda.grad, + rtol=1e-2, + atol=1e-3).item() + print(f'*tensor3 {res} check_gradient_numerical(D={channels})') + + +if __name__ == '__main__': + check_forward_equal_with_paddle_float() + + for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: + check_gradient_numerical(channels) diff --git a/paddlevlp/models/groundingdino/fuse_modules.py b/paddlevlp/models/groundingdino/fuse_modules.py new file mode 100644 index 00000000000000..0dc731cfa66e7d --- /dev/null +++ b/paddlevlp/models/groundingdino/fuse_modules.py @@ -0,0 +1,312 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant +from .layers import DropPath +from paddlenlp.utils.initializer import constant_,xavier_uniform_ +from .utils import masked_fill + + +class FeatureResizer(nn.Layer): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def l1norm(X, dim, eps=1e-8): + """L1-normalize columns of X""" + norm = paddle.abs(X).sum(axis=dim, keepdim=True) + eps + X = paddle.divide(X, norm) + return X + + +def l2norm(X, dim, eps=1e-8): + """L2-normalize columns of X""" + norm = paddle.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps + X = paddle.divide(X, norm) + return X + + +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): + """ + query: (n_context, queryL, d) + context: (n_context, sourceL, d) + """ + batch_size_q, queryL = query.shape[:2] + batch_size, sourceL = context.shape[:2] + + # Get attention + # --> (batch, d, queryL) + queryT = query.transpose([0, 2, 1]) + + # (batch, sourceL, d)(batch, d, queryL) + # --> (batch, sourceL, queryL) + attn = paddle.bmm(context, queryT) + if raw_feature_norm == "softmax": + # --> (batch*sourceL, queryL) + attn = attn.reshape([batch_size * sourceL, queryL]) + attn = nn.Softmax()(attn) + # --> (batch, sourceL, queryL) + attn = attn.reshape(batch_size, sourceL, queryL) + elif raw_feature_norm == "l2norm": + attn = l2norm(attn, 2) + elif raw_feature_norm == "clipped_l2norm": + attn = nn.LeakyReLU(0.1)(attn) + attn = l2norm(attn, 2) + else: + raise ValueError("unknown first norm type:", raw_feature_norm) + # --> (batch, queryL, sourceL) + attn = attn.transpose([0, 2, 1]) + # --> (batch*queryL, sourceL) + attn = attn.reshape([batch_size * queryL, sourceL]) + attn = nn.Softmax()(attn * smooth) + # --> (batch, queryL, sourceL) + attn = attn.reshape([batch_size, queryL, sourceL]) + # --> (batch, sourceL, queryL) + attnT = attn.transpose([0, 2, 1]) + + # --> (batch, d, sourceL) + contextT = context.transpose([0, 2, 1]) + # (batch x d x sourceL)(batch x sourceL x queryL) + # --> (batch, d, queryL) + weightedContext = paddle.bmm(contextT, attnT) + # --> (batch, queryL, d) + weightedContext = weightedContext.transpose([0, 2, 1]) + + return weightedContext, attnT + + +class BiMultiHeadAttention(nn.Layer): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.l_proj = nn.Linear(self.l_dim, self.embed_dim) + self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) + + self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) + self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor, seq_len, bsz): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def _reset_parameters(self): + xavier_uniform_(self.v_proj.weight) + constant_(self.v_proj.bias) + xavier_uniform_(self.l_proj.weight) + constant_(self.l_proj.bias) + xavier_uniform_(self.values_v_proj.weight) + constant_(self.values_v_proj.bias) + xavier_uniform_(self.values_l_proj.weight) + constant_(self.values_l_proj.bias) + xavier_uniform_(self.out_v_proj.weight) + constant_(self.out_v_proj.bias) + xavier_uniform_(self.out_l_proj.weight) + constant_(self.out_l_proj.bias) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + """_summary_ + + Args: + v (_type_): bs, n_img, dim + l (_type_): bs, n_text, dim + attention_mask_v (_type_, optional): _description_. bs, n_img + attention_mask_l (_type_, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + + bsz, tgt_len, _ = v.shape + + query_states = self.v_proj(v) * self.scale + key_states = self._shape(self.l_proj(l), -1, bsz) + value_v_states = self._shape(self.values_v_proj(v), -1, bsz) + value_l_states = self._shape(self.values_l_proj(l), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) + key_states = key_states.reshape(proj_shape) + value_v_states = value_v_states.reshape(proj_shape) + value_l_states = value_l_states.reshape(proj_shape) + + src_len = key_states.shape[1] + attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1])) # bs*nhead, nimg, ntxt + + if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.shape}" + ) + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + attn_weights = paddle.clip( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights = paddle.clip( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose([0, 2, 1]) + attn_weights_l = attn_weights_T - paddle.max(attn_weights_T, axis=-1, keepdim=True) + if self.clamp_min_for_underflow: + attn_weights_l = paddle.clip( + attn_weights_l, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights_l = paddle.clip( + attn_weights_l, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if attention_mask_v is not None: + + attention_mask_v = ( + attention_mask_v[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights_l = masked_fill(attn_weights_l, attention_mask_v == 1., float("-inf")) + + 
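+        # If a vision mask was supplied above, the masked image positions now
+        # hold -inf logits, so the softmax below assigns them zero attention
+        # weight in the text-to-image direction.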
attn_weights_l = F.softmax(attn_weights_l, axis=-1) + + # mask language for vision + if attention_mask_l is not None: + attention_mask_l = ( + attention_mask_l[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights = masked_fill(attn_weights, attention_mask_l == 1., float("-inf")) + + attn_weights_v = F.softmax(attn_weights, axis=-1) + + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = paddle.bmm(attn_probs_v, value_l_states) + attn_output_l = paddle.bmm(attn_probs_l, value_v_states) + + if attn_output_v.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: + raise ValueError( + f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.shape}" + ) + + if attn_output_l.shape != [bsz * self.num_heads, src_len, self.head_dim]: + raise ValueError( + f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.shape}" + ) + + attn_output_v = attn_output_v.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output_v = attn_output_v.transpose([0, 2, 1, 3]) + attn_output_v = attn_output_v.reshape([bsz, tgt_len, self.embed_dim]) + + attn_output_l = attn_output_l.reshape([bsz, self.num_heads, src_len, self.head_dim]) + attn_output_l = attn_output_l.transpose([0, 2, 1, 3]) + attn_output_l = attn_output_l.reshape([bsz, src_len, self.embed_dim]) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +# Bi-Direction MHA (text->image, image->text) +class BiAttentionBlock(nn.Layer): + def __init__( + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super(BiAttentionBlock, self).__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = self.create_parameter( + shape=[v_dim], attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) + self.gamma_l = self.create_parameter( + shape=[l_dim], attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + v = self.layer_norm_v(v) + l = self.layer_norm_l(l) + delta_v, delta_l = self.attn( + v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l + ) + # v, l = v + delta_v, l + delta_l + v = v + self.drop_path(self.gamma_v * delta_v) + l = l + self.drop_path(self.gamma_l * delta_l) + return v, l + diff --git a/paddlevlp/models/groundingdino/layers.py b/paddlevlp/models/groundingdino/layers.py new file mode 100644 index 00000000000000..fe7b072affc41b --- /dev/null +++ b/paddlevlp/models/groundingdino/layers.py @@ -0,0 +1,256 @@ +import paddle +import paddle.nn as nn +import 
paddle.nn.functional as F +from paddlenlp.utils.initializer import constant_,xavier_uniform_ + +from itertools import repeat +import collections.abc + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(repeat(x, n)) + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def _convert_attention_mask(attn_mask, dtype): + """ + Convert the attention mask to the target dtype we expect. + Parameters: + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + dtype (VarType): The target type of `attn_mask` we expect. + Returns: + Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. + """ + return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + + Examples: + + .. 
code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim: + self.in_proj_weight = self.create_parameter( + shape=[embed_dim, 3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=False) + self.in_proj_bias = self.create_parameter( + shape=[3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=True) + else: + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(self.kdim, embed_dim) + self.v_proj = nn.Linear(self.vdim, embed_dim) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + self._type_list = ('q_proj', 'k_proj', 'v_proj') + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + constant_(p) + + + def compute_qkv(self, tensor, index): + if self._qkv_same_embed_dim: + tensor = F.linear( + x=tensor, + weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) + * self.embed_dim], + bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * + self.embed_dim] + if self.in_proj_bias is not None else None) + else: + tensor = getattr(self, self._type_list[index])(tensor) + tensor = tensor.reshape( + [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + return tensor + + + def forward(self, query, key=None, value=None, attn_mask=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. 
When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = (self.compute_qkv(t, i) + for i, t in enumerate([query, key, value])) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + scaling = float(self.head_dim)**-0.5 + product = product * scaling + + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, + self.dropout, + training=self.training, + mode="upscale_in_train") + + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + return out if len(outs) == 1 else tuple(outs) + + +def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = paddle.bernoulli(paddle.full(shape, keep_prob, dtype=x.dtype)) + if keep_prob > 0.0 and scale_by_keep: + random_tensor = paddle.divide(random_tensor, paddle.to_tensor(keep_prob)) + return x * random_tensor + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/modeling.py b/paddlevlp/models/groundingdino/modeling.py new file mode 100644 index 00000000000000..11f5fcf76559cc --- /dev/null +++ b/paddlevlp/models/groundingdino/modeling.py @@ -0,0 +1,285 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import List + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Layer + + +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model +from paddlenlp.utils.initializer import constant_,xavier_uniform_ +from paddlenlp.transformers import AutoTokenizer, BertModel, RobertaModel + + +from .utils import MLP, ContrastiveEmbed,inverse_sigmoid + +from .bertwarper import ( + BertModelWarper, + generate_masks_with_special_tokens, + generate_masks_with_special_tokens_and_transfer_map, +) + +from .configuration import ( + GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION, + GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP, + GroundingDinoConfig, +) +from .backbone import build_backbone +from .transformer import build_transformer + + +__all__ = [ + "GroundingDinoModel", + "GroundingDinoPretrainedModel", +] + + +class GroundingDinoPretrainedModel(PretrainedModel): + """ + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = "config.json" + config_class = GroundingDinoConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "groundding" + + pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP + +@register_base_model +class GroundingDinoModel(GroundingDinoPretrainedModel): + """ + Args: + config (:class:`GroundingDinoConfig`): + An instance of BertConfig used to construct BertModel. 
+ """ + + def __init__(self, config: GroundingDinoConfig): + super(GroundingDinoModel, self).__init__(config) + + self.query_dim = config.query_dim + self.backbone = build_backbone(config) + self.transformer = build_transformer(config) + self.hidden_dim = hidden_dim = self.transformer.d_model + self.num_feature_levels = config.num_feature_levels + self.nheads = config.nheads + self.max_text_len = config.max_text_len + self.sub_sentence_present = config.sub_sentence_present + + # bert + if config.text_encoder_type == "bert-base-uncased": + self.bert = BertModel.from_pretrained(config.text_encoder_type) + elif config.text_encoder_type == "roberta-base": + self.bert = RobertaModel.from_pretrained(config.text_encoder_type) + else: + raise ValueError("Unknown text_encoder_type {}".format(config.text_encoder_type)) + self.bert.pooler.dense.weight.stop_gradient = True + self.bert.pooler.dense.bias.stop_gradient = True + self.bert = BertModelWarper(bert_model=self.bert) + + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias_attr=True) + constant_(self.feat_map.bias, 0) + xavier_uniform_(self.feat_map.weight) + + + # prepare input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(self.backbone.num_channels) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = self.backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2D(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2D(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + self.input_proj = nn.LayerList(input_proj_list) + else: + assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" 
+ self.input_proj = nn.LayerList( + [ + nn.Sequential( + nn.Conv2D(self.backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) + + # prepare class & box embed + _class_embed = ContrastiveEmbed() + + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + constant_(_bbox_embed.layers[-1].weight, 0) + constant_(_bbox_embed.layers[-1].bias, 0) + + if config.dec_pred_bbox_embed_share: + box_embed_layerlist = [_bbox_embed for i in range(self.transformer.num_decoder_layers)] + else: + box_embed_layerlist = [ + copy.deepcopy(_bbox_embed) for i in range(self.transformer.num_decoder_layers) + ] + class_embed_layerlist = [_class_embed for i in range(self.transformer.num_decoder_layers)] + self.bbox_embed = nn.LayerList(box_embed_layerlist) + self.class_embed = nn.LayerList(class_embed_layerlist) + self.transformer.decoder.bbox_embed = self.bbox_embed + self.transformer.decoder.class_embed = self.class_embed + + # two stage + assert config.two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + config.two_stage_type + ) + if config.two_stage_type != "no": + if config.two_stage_bbox_embed_share: + assert config.dec_pred_bbox_embed_share + self.transformer.enc_out_bbox_embed = _bbox_embed + else: + self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed) + + if config.two_stage_class_embed_share: + assert config.dec_pred_bbox_embed_share + self.transformer.enc_out_class_embed = _class_embed + else: + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) + + self.refpoint_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + # init input_proj + for proj in self.input_proj: + xavier_uniform_(proj[0].weight, gain=1) + constant_(proj[0].bias, 0) + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) + + + def forward( + self, + x: paddle.Tensor, + m: paddle.Tensor, + input_ids:paddle.Tensor, + attention_mask:paddle.Tensor, + text_self_attention_masks:paddle.Tensor, + position_ids:paddle.Tensor = None, + targets: List = None + + ): + + tokenized = { + "input_ids": input_ids, + "attention_mask":attention_mask, + } + + # extract text embeddings + if self.sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + else: + # import ipdb; ipdb.set_trace() + tokenized_for_encoder = tokenized + + bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 + + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized["attention_mask"].cast(paddle.bool) # bs, 195 + # text_token_mask: True for nomask, False for mask + # text_self_attention_masks: True for nomask, False for mask + + if encoded_text.shape[1] > self.max_text_len: + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + + text_dict = { + "encoded_text": encoded_text, # bs, 195, d_model + "text_token_mask": text_token_mask, # bs, 195 + "position_ids": position_ids, # bs, 195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 + } + + features,feat_masks,poss = self.backbone(x,m) + + + srcs = [] + masks = [] + for l, 
src in enumerate(features): + # src, mask = feat.decompose() + srcs.append(self.input_proj[l](src)) + masks.append(feat_masks[l]) + # assert mask is not None + + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + # src = self.input_proj[l](features[-1].tensors) + src = self.input_proj[l](features[-1]) + else: + src = self.input_proj[l](srcs[-1]) + # m = samples.mask + mask = F.interpolate(m[None].cast(paddle.float32), size=src.shape[-2:]).cast(paddle.bool)[0] + # pos_l = self.backbone[1](NestedTensor(src, mask)).cast(src.dtype) + pos_l = self.backbone[1](mask).cast(src.dtype) + srcs.append(src) + masks.append(mask) + poss.append(pos_l) + + input_query_bbox = input_query_label = attn_mask = dn_meta = None + hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( + srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict + ) + + # deformable-detr-like anchor update + outputs_coord_list = [] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig) + layer_outputs_unsig = F.sigmoid(layer_outputs_unsig) + outputs_coord_list.append(layer_outputs_unsig) + outputs_coord_list = paddle.stack(outputs_coord_list) + + # output + outputs_class = paddle.stack( + [ + layer_cls_embed(layer_hs, text_dict) + for layer_cls_embed, layer_hs in zip(self.class_embed, hs) + ] + ) + + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]} + + return out \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/ms_deform_attn.py b/paddlevlp/models/groundingdino/ms_deform_attn.py new file mode 100644 index 00000000000000..6b0a43c37fc938 --- /dev/null +++ b/paddlevlp/models/groundingdino/ms_deform_attn.py @@ -0,0 +1,210 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
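+
+# Note on the sampling convention shared by the CUDA op (csrc/ms_deformable_attn_op.cu)
+# and the pure-Paddle fallback below: a sampling location `loc` in [0, 1] on an
+# H x W feature level is read at pixel coordinate
+#
+#     loc * H - 0.5                                   (CUDA kernel)
+#
+# while the fallback first maps it to grid = 2 * loc - 1 and calls
+# F.grid_sample(..., align_corners=False), which resolves to
+#
+#     ((grid + 1) * H - 1) / 2 = loc * H - 0.5        (Paddle fallback)
+#
+# so the two paths sample identical positions and differ only by floating-point
+# accumulation order, which is what csrc/test_ms_deformable_attn_op.py checks.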
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddlenlp.utils.initializer import constant_,xavier_uniform_ + + + +# helpers +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +def deformable_attention_core_func(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights): + """ + Args: + value (Tensor): [bs, value_length, n_head, c] + value_spatial_shapes (Tensor): [n_levels, 2] + value_level_start_index (Tensor): [n_levels] + sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] + attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, _, n_head, c = value.shape + _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape + + value_list = value.split( + value_spatial_shapes.prod(1).split(n_levels), axis=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (h, w) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[level].flatten(2).transpose( + [0, 2, 1]).reshape([bs * n_head, c, h, w]) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose( + [0, 2, 1, 3, 4]).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( + [bs * n_head, 1, Len_q, n_levels * n_points]) + output = (paddle.stack( + sampling_value_list, axis=-2).flatten(-2) * + attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) + + return output.transpose([0, 2, 1]) + +class MSDeformableAttention(nn.Layer): + def __init__(self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1, + batch_first=False): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + try: + # use cuda op + from deformable_detr_ops import ms_deformable_attn + except: + # use paddle func + ms_deformable_attn = deformable_attention_core_func + self.ms_deformable_attn_core = ms_deformable_attn + self.batch_first = batch_first + + self._reset_parameters() + + def _reset_parameters(self): + # sampling_offsets + constant_(self.sampling_offsets.weight) + thetas = paddle.arange( + self.num_heads, + dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + 
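+        # The bias built below starts each head at a distinct direction on the
+        # unit circle (angles spaced 2*pi/num_heads apart); after the max-abs
+        # normalization and the 1..num_points scaling, point p of every head is
+        # initialized p "steps" out along that head's direction, giving the
+        # deformable attention a spread-out sampling pattern before training.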
grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( + [1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange( + 1, self.num_points + 1, + dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init *= scaling + self.sampling_offsets.bias.set_value(grid_init.flatten()) + # attention_weights + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + # proj + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward(self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + assert int(value_spatial_shapes.prod(1).sum()) == Len_v + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.transpose([1, 0, 2]) + value = value.permute([1, 0, 2]) + + value = self.value_proj(value) + if value_mask is not None: + value_mask = (~value_mask).astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = value_spatial_shapes.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". + format(reference_points.shape[-1])) + + + output = self.ms_deformable_attn_core( + value, value_spatial_shapes.astype('int64'), value_level_start_index.astype('int64'), + sampling_locations, attention_weights) + output = self.output_proj(output) + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + output = output.transpose([1, 0, 2]) + + return output \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/transformer.py b/paddlevlp/models/groundingdino/transformer.py new file mode 100644 index 00000000000000..697e6a90626eb9 --- /dev/null +++ b/paddlevlp/models/groundingdino/transformer.py @@ -0,0 +1,970 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute + +from .utils import inverse_sigmoid +from paddlenlp.utils.initializer import constant_,xavier_uniform_,normal_ +from .layers import MultiHeadAttention + +from .fuse_modules import BiAttentionBlock +from .ms_deform_attn import MSDeformableAttention as MSDeformAttn +from .transformer_vanilla import TransformerEncoderLayer +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, +) + + +class Transformer(nn.Layer): + def __init__( + self, + d_model=256, + nhead=8, + num_queries=300, + num_encoder_layers=6, + num_unicoder_layers=0, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + query_dim=4, + num_patterns=0, + # for deformable encoder + num_feature_levels=1, + enc_n_points=4, + dec_n_points=4, + # init query + learnable_tgt_init=False, + # two stage + two_stage_type="no", # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1'] + embed_init_tgt=False, + # for text + use_text_enhancer=False, + use_fusion_layer=False, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=False, + text_dropout=0.1, + fusion_dropout=0.1, + fusion_droppath=0.0, + ): + super().__init__() + self.num_feature_levels = num_feature_levels + self.num_encoder_layers = num_encoder_layers + self.num_unicoder_layers = num_unicoder_layers + self.num_decoder_layers = num_decoder_layers + self.num_queries = num_queries + assert query_dim == 4 + + # choose encoder layer type + encoder_layer = DeformableTransformerEncoderLayer( + d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points + ) + + if use_text_enhancer: + text_enhance_layer = TransformerEncoderLayer( + d_model=d_model, + nhead=nhead // 2, + dim_feedforward=dim_feedforward // 2, + dropout=text_dropout, + ) + else: + text_enhance_layer = None + + if use_fusion_layer: + feature_fusion_layer = BiAttentionBlock( + v_dim=d_model, + l_dim=d_model, + embed_dim=dim_feedforward // 2, + num_heads=nhead // 2, + dropout=fusion_dropout, + drop_path=fusion_droppath, + ) + else: + feature_fusion_layer = None + + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + assert encoder_norm is None + self.encoder = TransformerEncoder( + encoder_layer, + num_encoder_layers, + d_model=d_model, + num_queries=num_queries, + text_enhance_layer=text_enhance_layer, + feature_fusion_layer=feature_fusion_layer, + use_checkpoint=use_checkpoint, + use_transformer_ckpt=use_transformer_ckpt, + ) + + # choose decoder layer type + decoder_layer = DeformableTransformerDecoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + dec_n_points, + 
use_text_cross_attention=use_text_cross_attention, + ) + + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + d_model=d_model, + query_dim=query_dim, + num_feature_levels=num_feature_levels, + ) + + self.d_model = d_model + self.nhead = nhead + self.dec_layers = num_decoder_layers + self.num_queries = num_queries # useful for single stage model only + self.num_patterns = num_patterns + if not isinstance(num_patterns, int): + Warning("num_patterns should be int but {}".format(type(num_patterns))) + self.num_patterns = 0 + + if num_feature_levels > 1: + if self.num_encoder_layers > 0: + self.level_embed = self.create_parameter(shape=[num_feature_levels, d_model]) + # self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + else: + self.level_embed = None + + self.learnable_tgt_init = learnable_tgt_init + assert learnable_tgt_init, "why not learnable_tgt_init" + self.embed_init_tgt = embed_init_tgt + if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"): + self.tgt_embed = nn.Embedding(self.num_queries, d_model) + normal_(self.tgt_embed.weight) + else: + self.tgt_embed = None + + # for two stage + self.two_stage_type = two_stage_type + assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + two_stage_type + ) + if two_stage_type == "standard": + # anchor selection at the output of encoder + self.enc_output = nn.Linear(d_model, d_model) + self.enc_output_norm = nn.LayerNorm(d_model) + self.two_stage_wh_embedding = None + + if two_stage_type == "no": + self.init_ref_points(num_queries) # init self.refpoint_embed + + self.enc_out_class_embed = None + self.enc_out_bbox_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + for m in self.sublayers(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + if self.num_feature_levels > 1 and self.level_embed is not None: + normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = paddle.sum(~mask[:, :, 0], 1) + valid_W = paddle.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.cast(paddle.float32) / H + valid_ratio_w = valid_W.cast(paddle.float32) / W + valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, 4) + + def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None): + """ + Input: + - srcs: List of multi features [bs, ci, hi, wi] + - masks: List of multi masks [bs, hi, wi] + - refpoint_embed: [bs, num_dn, 4]. None in infer + - pos_embeds: List of multi pos embeds [bs, ci, hi, wi] + - tgt: [bs, num_dn, d_model]. 
None in infer + + """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shapes.append(paddle.to_tensor([h, w])) + + src = src.flatten(2).transpose([0, 2, 1]) # bs, hw, c + mask = mask.cast(paddle.float32).flatten(1).cast(paddle.bool) # bs, hw + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) # bs, hw, c + if self.num_feature_levels > 1 and self.level_embed is not None: + lvl_pos_embed = pos_embed + self.level_embed[lvl].reshape([1, 1, -1]) + else: + lvl_pos_embed = pos_embed + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = paddle.concat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = paddle.concat(mask_flatten, 1) # bs, \sum{hxw} + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c + + spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes), dtype=paddle.int32 + ) + + level_start_index = paddle.concat( + (paddle.zeros([1], dtype=spatial_shapes.dtype), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = paddle.stack([self.get_valid_ratio(m) for m in masks], 1) + + # two stage + enc_topk_proposals = enc_refpoint_embed = None + + ######################################################### + # Begin Encoder + ######################################################### + memory, memory_text = self.encoder( + src_flatten, + pos=lvl_pos_embed_flatten, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + key_padding_mask=mask_flatten, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . 
False means use the token; True means pad the token + position_ids=text_dict["position_ids"], + text_self_attention_masks=text_dict["text_self_attention_masks"], + ) + ######################################################### + # End Encoder + # - memory: bs, \sum{hw}, c + # - mask_flatten: bs, \sum{hw} + # - lvl_pos_embed_flatten: bs, \sum{hw}, c + # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + ######################################################### + text_dict["encoded_text"] = memory_text + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if memory.isnan().any() | memory.isinf().any(): + # import ipdb; ipdb.set_trace() + + + if self.two_stage_type == "standard": + output_memory, output_proposals = gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes + ) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + + if text_dict is not None: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) + else: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) + + topk_logits = enc_outputs_class_unselected.max(-1) + enc_outputs_coord_unselected = ( + self.enc_out_bbox_embed(output_memory) + output_proposals + ) # (bs, \sum{hw}, 4) unsigmoid + topk = self.num_queries + + topk_proposals = paddle.topk(topk_logits, topk, axis=1)[1] # bs, nq + + topk_ind = topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, 4]) + + # gather boxes + refpoint_embed_undetach = paddle.take_along_axis( + arr=enc_outputs_coord_unselected, + axis=1, + indices=topk_ind) + + refpoint_embed_ = refpoint_embed_undetach.detach() + init_box_proposal = F.sigmoid(paddle.take_along_axis( + arr=output_proposals, + axis=1, + indices=topk_ind)) + + tgt_undetach = paddle.take_along_axis(arr=output_memory, axis=1,indices=topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, self.d_model])) + + # gather tgt + # tgt_undetach = paddle.gather_nd(output_memory, topk_ind) + if self.embed_init_tgt: + tgt_ = ( + self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) + ) # nq, bs, d_model + else: + tgt_ = tgt_undetach.detach() + + if refpoint_embed is not None: + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) + tgt = paddle.concat([tgt, tgt_], axis=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + elif self.two_stage_type == "no": + tgt_ = ( + self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) + ) # nq, bs, d_model + refpoint_embed_ = ( + self.refpoint_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) + ) # nq, bs, 4 + + if refpoint_embed is not None: + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) + tgt = paddle.concat([tgt, tgt_], axis=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + if self.num_patterns > 0: + tgt_embed = tgt.tile([1, self.num_patterns, 1]) + refpoint_embed = refpoint_embed.tile([1, self.num_patterns, 1]) + tgt_pat = self.patterns.weight[None, :, :].repeat_interleave( + self.num_queries, 1 + ) # 1, n_q*n_pat, d_model + tgt = tgt_embed + tgt_pat + + init_box_proposal = F.sigmoid(refpoint_embed_) + + else: + raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) + ######################################################### + # End preparing tgt + # - tgt: bs, NQ, d_model + # - refpoint_embed(unsigmoid): bs, NQ, d_model + 
######################################################### + + ######################################################### + # Begin Decoder + ######################################################### + hs, references = self.decoder( + tgt=tgt, + memory=memory, + memory_key_padding_mask=mask_flatten, + pos=lvl_pos_embed_flatten, + refpoints_unsigmoid=refpoint_embed, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=attn_mask, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . False means use the token; True means pad the token + ) + ######################################################### + # End Decoder + # hs: n_dec, bs, nq, d_model + # references: n_dec+1, bs, nq, query_dim + ######################################################### + + ######################################################### + # Begin postprocess + ######################################################### + if self.two_stage_type == "standard": + hs_enc = tgt_undetach.unsqueeze(0) + ref_enc = F.sigmoid(refpoint_embed_undetach).unsqueeze(0) + else: + hs_enc = ref_enc = None + ######################################################### + # End postprocess + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None + # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None + ######################################################### + + return hs, references, hs_enc, ref_enc, init_box_proposal + # hs: (n_dec, bs, nq, d_model) + # references: sigmoid coordinates. (n_dec+1, bs, bq, 4) + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None + # ref_enc: sigmoid coordinates. \ + # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None + + +class TransformerEncoder(nn.Layer): + def __init__( + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): + """_summary_ + + Args: + encoder_layer (_type_): _description_ + num_layers (_type_): _description_ + norm (_type_, optional): _description_. Defaults to None. + d_model (int, optional): _description_. Defaults to 256. + num_queries (int, optional): _description_. Defaults to 300. + enc_layer_share (bool, optional): _description_. Defaults to False. 
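+            text_enhance_layer (_type_, optional): text self-attention layer applied at every encoder stage. Defaults to None.
+            feature_fusion_layer (_type_, optional): image-text feature fusion layer applied at every encoder stage. Defaults to None.
+            use_checkpoint (bool, optional): recompute the fusion layers to save memory. Defaults to False.
+            use_transformer_ckpt (bool, optional): recompute the deformable encoder layers to save memory. Defaults to False.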
+ + """ + super().__init__() + # prepare layers + self.layers = [] + self.text_layers = [] + self.fusion_layers = [] + if num_layers > 0: + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + + if text_enhance_layer is not None: + self.text_layers = _get_clones( + text_enhance_layer, num_layers, layer_share=enc_layer_share + ) + if feature_fusion_layer is not None: + self.fusion_layers = _get_clones( + feature_fusion_layer, num_layers, layer_share=enc_layer_share + ) + else: + self.layers = [] + del encoder_layer + + if text_enhance_layer is not None: + self.text_layers = [] + del text_enhance_layer + if feature_fusion_layer is not None: + self.fusion_layers = [] + del feature_fusion_layer + + self.query_scale = None + self.num_queries = num_queries + self.num_layers = num_layers + self.d_model = d_model + + self.use_checkpoint = False + self.use_transformer_ckpt = False + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = paddle.meshgrid( + paddle.linspace(0.5, H_ - 0.5, H_, dtype=paddle.float32), + paddle.linspace(0.5, W_ - 0.5, W_, dtype=paddle.float32), + ) + ref_y = ref_y.reshape([-1,])[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape([-1,])[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = paddle.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = paddle.concat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + # for images + src: paddle.Tensor, + pos: paddle.Tensor, + spatial_shapes: paddle.Tensor, + level_start_index: paddle.Tensor, + valid_ratios: paddle.Tensor, + key_padding_mask: paddle.Tensor, + # for texts + memory_text: paddle.Tensor = None, + text_attention_mask: paddle.Tensor = None, + pos_text: paddle.Tensor = None, + text_self_attention_masks: paddle.Tensor = None, + position_ids: paddle.Tensor = None, + ): + """ + Input: + - src: [bs, sum(hi*wi), 256] + - pos: pos embed for src. [bs, sum(hi*wi), 256] + - spatial_shapes: h,w of each level [num_level, 2] + - level_start_index: [num_level] start point of level in sum(hi*wi). 
+ - valid_ratios: [bs, num_level, 2] + - key_padding_mask: [bs, sum(hi*wi)] + + - memory_text: bs, n_text, 256 + - text_attention_mask: bs, n_text + False for no padding; True for padding + - pos_text: bs, n_text, 256 + + - position_ids: bs, n_text + Intermedia: + - reference_points: [bs, sum(hi*wi), num_level, 2] + Outpus: + - output: [bs, sum(hi*wi), 256] + """ + + output = src + + # preparation and reshape + if self.num_layers > 0: + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios + ) + + if self.text_layers: + # generate pos_text + bs, n_text, text_dim = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + paddle.arange(n_text) + .cast(paddle.float32) + .unsqueeze(0) + .unsqueeze(-1) + .tile([bs, 1, 1]) + ) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_sine_pos_embed( + position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + # main process + for layer_id, layer in enumerate(self.layers): + # if output.isnan().any() or memory_text.isnan().any(): + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + if self.fusion_layers: + if self.use_checkpoint: + output, memory_text = recompute( + self.fusion_layers[layer_id], + output, + memory_text, + key_padding_mask, + text_attention_mask, + **{"preserve_rng_state": True} + ) + else: + output, memory_text = self.fusion_layers[layer_id]( + v=output, + l=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + + if self.text_layers: + memory_text = self.text_layers[layer_id]( + src=memory_text, + src_mask=text_self_attention_masks, # note we use ~ for mask here + src_key_padding_mask=text_attention_mask, + pos=(pos_text if pos_text is not None else None), + ) + + # main process + if self.use_transformer_ckpt: + output = recompute( + layer, + output, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask, + **{"preserve_rng_state": True} + ) + else: + output = layer( + src=output, + pos=pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + + return output, memory_text + + +class TransformerDecoder(nn.Layer): + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + self.query_pos_sine_scale = None + + self.query_scale = None + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + + self.ref_anchor_head = None + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[paddle.Tensor] = None, + memory_mask: Optional[paddle.Tensor] = None, + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + memory_key_padding_mask: Optional[paddle.Tensor] = None, + pos: Optional[paddle.Tensor] = None, + refpoints_unsigmoid: Optional[paddle.Tensor] = None, # 
num_queries, bs, 2 + # for memory + level_start_index: Optional[paddle.Tensor] = None, # num_levels + spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[paddle.Tensor] = None, + # for text + memory_text: Optional[paddle.Tensor] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = F.sigmoid(refpoints_unsigmoid) + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] + * paddle.concat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :] + ) # nq, bs, 256*2 + + # conditional query + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if query_pos.isnan().any() | query_pos.isinf().any(): + # import ipdb; ipdb.set_trace() + + # main process + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory_text=memory_text, + text_attention_mask=text_attention_mask, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask, + ) + + if (output.isnan().any() | output.isinf().any()) and paddle.in_dynamic_mode(): + print(f"output layer_id {layer_id} is nan") + try: + num_nan = output.isnan().sum().item() + num_inf = output.isinf().sum().item() + print(f"num_nan {num_nan}, num_inf {num_inf}") + except Exception as e: + print(e) + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # import ipdb; ipdb.set_trace() + + # iter update + if self.bbox_embed is not None: + # box_holder = self.bbox_embed(output) + # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) + # new_reference_points = box_holder[..., :self.query_dim].sigmoid() + + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = F.sigmoid(outputs_unsig) + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out for itm_out in intermediate], + [itm_refpoint for itm_refpoint in ref_points], + ] + + +class DeformableTransformerEncoderLayer(nn.Layer): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): + super().__init__() + + # self attention + self.self_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True + ) + self.dropout1 = nn.Dropout(dropout) + 
self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + ): + # self attention + # import ipdb; ipdb.set_trace() + + src2 = self.self_attn( + query=self.with_pos_embed(src, pos), + reference_points=reference_points, + value=src, + value_spatial_shapes=spatial_shapes, + value_level_start_index=level_start_index, + value_mask=key_padding_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerDecoderLayer(nn.Layer): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): + super().__init__() + + # cross attention + self.cross_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True + ) + self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm1 = nn.LayerNorm(d_model) + + # cross attention text + if use_text_cross_attention: + self.ca_text = MultiHeadAttention(d_model, n_heads, dropout=dropout) + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.catext_norm = nn.LayerNorm(d_model) + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) + self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_proj = None + self.use_text_feat_guide = use_text_feat_guide + assert not use_text_feat_guide + self.use_text_cross_attention = use_text_cross_attention + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + # with paddle.amp.auto_cast(enable=False): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + # for tgt + tgt: Optional[paddle.Tensor], # nq, bs, d_model + tgt_query_pos: Optional[paddle.Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[paddle.Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + tgt_reference_points: Optional[paddle.Tensor] = None, # nq, bs, 4 + memory_text: Optional[paddle.Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[paddle.Tensor] = None, # bs, num_token + # for memory + memory: Optional[paddle.Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[paddle.Tensor] = None, + memory_level_start_index: Optional[paddle.Tensor] = None, # num_levels + memory_spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[paddle.Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[paddle.Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[paddle.Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + assert cross_attn_mask is None + + # self attention + if self.self_attn is not None: + # import ipdb; ipdb.set_trace() + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask if self_attn_mask is None else ~self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.use_text_cross_attention: + tgt2 = self.ca_text( + self.with_pos_embed(tgt, tgt_query_pos), + memory_text, + memory_text, + attn_mask=~text_attention_mask, + )[0] + tgt = tgt + self.catext_dropout(tgt2) + tgt = self.catext_norm(tgt) + + tgt2 = self.cross_attn( + query=self.with_pos_embed(tgt, tgt_query_pos), + reference_points=tgt_reference_points, + value=memory, + value_spatial_shapes=memory_spatial_shapes, + value_level_start_index=memory_level_start_index, + value_mask=memory_key_padding_mask, + ) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + num_queries=args.num_queries, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + query_dim=args.query_dim, + activation=args.transformer_activation, + num_patterns=args.num_patterns, + num_feature_levels=args.num_feature_levels, + enc_n_points=args.enc_n_points, + dec_n_points=args.dec_n_points, + learnable_tgt_init=True, + # two stage + two_stage_type=args.two_stage_type, # ['no', 'standard', 'early'] + embed_init_tgt=args.embed_init_tgt, + use_text_enhancer=args.use_text_enhancer, + use_fusion_layer=args.use_fusion_layer, + use_checkpoint=args.use_checkpoint, + use_transformer_ckpt=args.use_transformer_ckpt, + use_text_cross_attention=args.use_text_cross_attention, + text_dropout=args.text_dropout, + fusion_dropout=args.fusion_dropout, + fusion_droppath=args.fusion_droppath, + ) diff --git a/paddlevlp/models/groundingdino/transformer_vanilla.py b/paddlevlp/models/groundingdino/transformer_vanilla.py new file mode 100644 index 00000000000000..a671c0e87c54c7 --- /dev/null +++ b/paddlevlp/models/groundingdino/transformer_vanilla.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import paddle +from paddle import Tensor, nn +import paddle.nn.functional as F +from .layers import MultiHeadAttention + + +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + sigmoid_focal_loss, +) + + +class TextTransformer(nn.Layer): + def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): + super().__init__() + self.num_layers = num_layers + self.d_model = d_model + self.nheads = nheads + self.dim_feedforward = dim_feedforward + self.norm = None + + single_encoder_layer = TransformerEncoderLayer( + d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout + ) + self.layers = _get_clones(single_encoder_layer, num_layers) + + def forward(self, memory_text: paddle.Tensor, text_attention_mask: paddle.Tensor): + """ + + Args: + text_attention_mask: bs, num_token + memory_text: bs, num_token, d_model + + Raises: + RuntimeError: _description_ + + Returns: + output: bs, num_token, d_model + """ + + output = memory_text + + for layer in self.layers: + output = layer(output, src_key_padding_mask=text_attention_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Layer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = MultiHeadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + self.nhead = nhead + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + # repeat attn mask + if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: + # bs, num_q, num_k + src_mask = src_mask.tile([self.nhead, 1, 1]) + + q = k = self.with_pos_embed(src, pos) + + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0] + + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src diff --git a/paddlevlp/models/groundingdino/utils.py b/paddlevlp/models/groundingdino/utils.py new file mode 100644 index 00000000000000..b55987720b5e4a --- /dev/null +++ b/paddlevlp/models/groundingdino/utils.py @@ -0,0 +1,270 @@ +import copy +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) 
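+    # emulate torch-style masked_fill: take `value` where mask is True, keep x elsewhere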
+ return paddle.where(mask, y, x) + +def inverse_sigmoid(x, eps=1e-3): + x = x.clip(min=0, max=1) + x1 = x.clip(min=eps) + x2 = (1 - x).clip(min=eps) + return paddle.log(x1 / x2) + +def _get_clones(module, N, layer_share=False): + + if layer_share: + return nn.LayerList([module for i in range(N)]) + else: + return nn.LayerList([copy.deepcopy(module) for i in range(N)]) + + +def get_sine_pos_embed( + pos_tensor: paddle.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (paddle.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. + """ + scale = 2 * math.pi + dim_t = paddle.arange(num_pos_feats) + dim_t = temperature ** (2. * paddle.floor_divide(dim_t, paddle.to_tensor(2)) / num_pos_feats) + + def sine_func(x: paddle.Tensor): + sin_x = x * scale / dim_t + sin_x = paddle.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), axis=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], axis=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = paddle.concat(pos_res, axis=-1) + return pos_res + + +def gen_encoder_output_proposals( + memory: paddle.Tensor, memory_padding_mask: paddle.Tensor, spatial_shapes: paddle.Tensor, learnedwh=None +): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + - learnedwh: 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].reshape([N_, H_, W_, 1]) + valid_H = paddle.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = paddle.sum(~mask_flatten_[:, 0, :, 0], 1) + + # import ipdb; ipdb.set_trace() + + grid_y, grid_x = paddle.meshgrid( + paddle.linspace(0, H_ - 1, H_, dtype=paddle.float32), + paddle.linspace(0, W_ - 1, W_, dtype=paddle.float32), + ) + grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N_, 1, 1, 2]) + grid = (grid.unsqueeze(0).tile([N_, 1, 1, 1]) + 0.5) / scale + + if learnedwh is not None: + # import ipdb; ipdb.set_trace() + wh = paddle.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) + else: + wh = paddle.ones_like(grid) * 0.05 * (2.0**lvl) + + # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) + # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + # wh = torch.ones_like(grid) / scale + proposal = paddle.concat((grid, wh), -1).reshape([N_, -1, 4]) + proposals.append(proposal) + _cur += H_ * W_ + # import ipdb; ipdb.set_trace() + output_proposals = paddle.concat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) + output_proposals = paddle.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = masked_fill(output_proposals, memory_padding_mask.unsqueeze(-1), 
float("inf")) + output_proposals = masked_fill(output_proposals, ~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = masked_fill(output_memory, memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) + + # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) + + return output_memory, output_proposals + + +class RandomBoxPerturber: + def __init__( + self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2 + ) -> None: + self.noise_scale = paddle.to_tensor( + [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale] + ) + + def __call__(self, refanchors: paddle.Tensor) -> paddle.Tensor: + nq, bs, query_dim = refanchors.shape + + noise_raw = paddle.rand(shape=refanchors.shape, dtype=refanchors.dtype) + noise_scale = self.noise_scale[:query_dim] + + new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) + return new_refanchors.clip(0, 1) + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. 
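+        no_reduction: (optional) If True, skip the final reduction and return the
+                 per-element loss. Default = False.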
+ Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if no_reduction: + return loss + + return loss.mean(1).sum() / num_boxes + + +class MLP(nn.Layer): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.LayerList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_activation_fn(activation, d_model=256, batch_dim=0): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = paddle.arange(128) + dim_t = 10000 ** (2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = paddle.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), axis=3).flatten(2) + pos_y = paddle.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), axis=3).flatten(2) + if pos_tensor.shape[-1] == 2: + pos = paddle.concat((pos_y, pos_x), aixs=2) + elif pos_tensor.shape[-1] == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = paddle.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), axis=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = paddle.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), axis=3).flatten(2) + + pos = paddle.concat((pos_y, pos_x, pos_w, pos_h), axis=2) + else: + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.shape[-1])) + return pos + + +class ContrastiveEmbed(nn.Layer): + def __init__(self, max_text_len=256): + """ + Args: + max_text_len: max length of text. + """ + super().__init__() + self.max_text_len = max_text_len + + def forward(self, x, text_dict): + """_summary_ + + Args: + x (_type_): _description_ + text_dict (_type_): _description_ + { + 'encoded_text': encoded_text, # bs, 195, d_model + 'text_token_mask': text_token_mask, # bs, 195 + # True for used tokens. 
False for padding tokens + } + Returns: + _type_: _description_ + """ + assert isinstance(text_dict, dict) + + y = text_dict["encoded_text"] + text_token_mask = text_dict["text_token_mask"] + + res = x @ y.transpose([0, 2, 1]) + masked_fill(res, ~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = paddle.full((*res.shape[:-1], self.max_text_len), float("-inf")) + new_res[..., : res.shape[-1]] = res + + return new_res diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py index 04006999f0b629..e3a4f252ceed86 100644 --- a/paddlevlp/processors/__init__.py +++ b/paddlevlp/processors/__init__.py @@ -16,3 +16,4 @@ from .blip_processing import * from .minigpt4_processing import * from .minigpt4_image_processing import * +from .groundingdino_processing import * diff --git a/paddlevlp/processors/groundingdino_processing.py b/paddlevlp/processors/groundingdino_processing.py new file mode 100644 index 00000000000000..eba2e15f0558e3 --- /dev/null +++ b/paddlevlp/processors/groundingdino_processing.py @@ -0,0 +1,365 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for GroundingDino. +""" + +import re +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import paddle +import paddle.vision.transforms as T +from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, + TensorType, TextInput) + +from .base_processing import ProcessorMixin + +from .image_utils import (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD,valid_images) +from .processing_utils import (BaseImageProcessor, BaseTextProcessor) +from paddlenlp.taskflow.utils import pad_batch_data +from .utils import _max_by_axis + +__all__ = [ + "GroudingDinoProcessor", + "GroudingDinoImageProcessor", + "GroudingDinoTextProcessor", +] + + +class GroudingDinoProcessor(ProcessorMixin): + + attributes = ["image_processor", "text_processor", "tokenizer"] + image_processor_class = "GroudingDinoImageProcessor" + text_processor_class = "GroudingDinoTextProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, text_processor, tokenizer): + super().__init__(image_processor, text_processor, tokenizer) + + def __call__( + self, + images=None, + text: str = None, + **kwargs, + ) : + + if images is None or text is None: + raise ValueError("You have to specify either images and text.") + + self.prompt = self.text_processor.pre_caption(text) + input_ids = self.tokenizer([self.prompt]).input_ids + specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + tokenized_out = self.text_processor(input_ids, specical_tokens) + + image_tensor,mask = self.image_processor(images) + + return image_tensor,mask,tokenized_out + + def decode(self, posmap): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
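+
+        Here `posmap` is expected to be a 1-D boolean mask over the tokens of the prompt
+        passed to `__call__`; the selected tokens are decoded back into a phrase.
+        Typical use (illustrative, thresholding one row of the predicted token logits):
+
+            pred_phrase = processor.decode(logit > text_threshold)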
+ """ + assert isinstance(posmap, paddle.Tensor), "posmap must be paddle.Tensor" + tokenized = self.tokenizer(self.prompt) + if posmap.dim() == 1: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].squeeze(-1).tolist() + token_ids = [tokenized["input_ids"][i] for i in non_zero_idx] + return self.tokenizer.decode(token_ids) + else: + raise NotImplementedError("posmap must be 1-dim") + + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class GroudingDinoTextProcessor(BaseTextProcessor): + r""" + Constructs a GroudingDino text processor. + """ + + def __init__( + self, + max_words: int = 256, + **kwargs, + ): + super().__init__(**kwargs) + + self.max_words = max_words + self.caption = None + + + def __call__( + self, + input_ids, + special_tokens_list, + **kwargs, + ): + """ + Preprocess the text with tokenization. + """ + tokenized_out = {} + input_ids = pad_batch_data(input_ids) + input_ids = paddle.to_tensor(input_ids, dtype = paddle.int64).squeeze(-1) + tokenized_out['input_ids'] = input_ids + tokenized_out['attention_mask'] = paddle.cast(input_ids != 0, paddle.int64) + + ( + text_self_attention_masks, + position_ids, + cate_to_token_mask_list, + ) = self.generate_masks_with_special_tokens_and_transfer_map(tokenized_out,special_tokens_list) + + if text_self_attention_masks.shape[1] > self.max_words: + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_words, : self.max_words + ] + position_ids = position_ids[:, : self.max_words] + tokenized_out["input_ids"] = tokenized_out["input_ids"][:, : self.max_words] + tokenized_out["attention_mask"] = tokenized_out["attention_mask"][:, : self.max_words] + tokenized_out['position_ids'] = position_ids + tokenized_out['text_self_attention_masks'] =text_self_attention_masks + + return tokenized_out + + def pre_caption(self, caption: str) -> str: + """ + Preprocess the text before tokenization. + """ + caption = caption.strip() + if not caption.endswith("."): + caption = caption + "." + self.caption = caption + return caption + + def generate_masks_with_special_tokens_and_transfer_map(self,tokenized,special_tokens_list): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool) + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = paddle.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) + ) + position_ids = paddle.zeros((bs, num_token)) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros([num_token,]).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list + +class GroudingDinoImageProcessor(BaseImageProcessor): + r""" + Constructs a GroudingDino image processor. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: List[int] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else 800 + + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = ( + image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + ) + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_nested = do_nested + + + def resize(self, + image, + target=None, + size=None, + max_size=1333): + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = T.resize(image, size) + + if target is None: + return rescaled_image + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * paddle.to_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height] + ) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = paddle.to_tensor([h, w]) + + if "masks" in target: + target["masks"] = ( + interpolate(target["masks"][:, None].cast(paddle.float32), size, 
mode="nearest")[:, 0] > 0.5 + ) + + return rescaled_image, target + + def nested_tensor_from_tensor_list(self,tensor_list: List[paddle.Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + tensor = paddle.zeros(batch_shape, dtype=dtype) + mask = paddle.ones((b, h, w), dtype=paddle.bool) + for i in range(b): + img = tensor_list[i] + tensor[i, :img.shape[0], :img.shape[1], :img.shape[2]] = img + mask[i, :img.shape[1], :img.shape[2]] = False + else: + raise ValueError("not supported") + return tensor, mask + + + def preprocess( + self, + images, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = None, + **kwargs, + ): + """ + Preprocess an image or batch of images. + + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_nested = do_nested if do_nested is not None else self.do_nested + + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = size if size is not None else self.size + + if not isinstance(images, (list, tuple)): + images = [images] + + if isinstance(images[0], str): + images = [load_image(image) for image in images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "paddle.Tensor." + ) + + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError( + "Image mean and std must be specified if do_normalize is True." 
+ ) + + if do_resize: + images = [ + T.to_tensor(self.resize(image=image, size=size)) + for image in images + ] + + if do_normalize: + images = T.normalize(images, mean=image_mean, std=image_std) + + if do_nested: + tensors, masks = self.nested_tensor_from_tensor_list(images) + + return tensors, masks + + diff --git a/paddlevlp/processors/utils.py b/paddlevlp/processors/utils.py index 896c4bcd24820b..d340dacbbcec90 100644 --- a/paddlevlp/processors/utils.py +++ b/paddlevlp/processors/utils.py @@ -24,3 +24,12 @@ def _missing_(cls, value): raise ValueError( f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" ) + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes From 776103061f9263ac8f8bbf6719708a215dd926c5 Mon Sep 17 00:00:00 2001 From: LokeZhou Date: Mon, 3 Jul 2023 12:45:12 +0000 Subject: [PATCH 07/10] fix pr comment --- paddlevlp/examples/groundingdino/README.md | 7 +- .../examples/groundingdino/run_predict.py | 91 +++++++--- paddlevlp/models/groundingdino/__init__.py | 12 +- .../models/groundingdino/backbone/backbone.py | 3 +- .../backbone/position_encoding.py | 88 ---------- .../backbone/swin_transformer.py | 165 +++--------------- paddlevlp/models/groundingdino/bert_model.py | 85 ++------- paddlevlp/models/groundingdino/bertwarper.py | 1 - .../models/groundingdino/configuration.py | 87 +++------ .../models/groundingdino/fuse_modules.py | 1 - paddlevlp/models/groundingdino/modeling.py | 10 +- paddlevlp/models/groundingdino/transformer.py | 25 +-- paddlevlp/models/groundingdino/utils.py | 10 +- 13 files changed, 152 insertions(+), 433 deletions(-) diff --git a/paddlevlp/examples/groundingdino/README.md b/paddlevlp/examples/groundingdino/README.md index d2a004578e15a7..58ccf1541bd7e8 100644 --- a/paddlevlp/examples/groundingdino/README.md +++ b/paddlevlp/examples/groundingdino/README.md @@ -16,10 +16,9 @@ python setup_ms_deformable_attn_op.py install ``` ## 2.2 dynamic inference ```bash -python3.8 run_predict.py -dt groundingdino-swint-ogc --i image_you_want_to_detect.jpg \ --o "dir you want to save the output" \ --t "Detect Cat" +python3.8 run_predict.py +--input_imag image_you_want_to_detect.jpg \ +--prompt "cat" \ ``` diff --git a/paddlevlp/examples/groundingdino/run_predict.py b/paddlevlp/examples/groundingdino/run_predict.py index f461caac41cf3d..6953f1927c0cba 100644 --- a/paddlevlp/examples/groundingdino/run_predict.py +++ b/paddlevlp/examples/groundingdino/run_predict.py @@ -1,4 +1,4 @@ -import argparse +from dataclasses import dataclass, field import os import numpy as np import paddle @@ -7,6 +7,8 @@ from paddlevlp.processors.groundingdino_processing import GroudingDinoProcessor from paddlevlp.models.groundingdino.modeling import GroundingDinoModel from PIL import Image, ImageDraw, ImageFont +from paddlenlp.trainer import PdArgumentParser +from paddlevlp.utils.log import logger def plot_boxes_to_image(image_pil, tgt): @@ -49,39 +51,74 @@ def plot_boxes_to_image(image_pil, tgt): return image_pil, mask -def main(): - parser = argparse.ArgumentParser("Grounding DINO example", add_help=True) - parser.add_argument("--dino_type", "-dt", type=str, default="groundingdino-swint-ogc", help="dino type") - parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file") - parser.add_argument("--text_prompt", "-t", type=str, required=True, 
help="text prompt") - parser.add_argument( - "--output_dir", "-o", type=str, default="outputs", help="output directory" +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + input_image: str = field( + metadata={"help": "The name of input image."} + ) + prompt: str = field( + default=None, metadata={"help": "The prompt of the image to be generated."} + ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="GroundingDino/groundingdino-swint-ogc", + metadata={"help": "Path to pretrained model or model identifier"}, ) - - parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") - parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") - parser.add_argument( - "--visual", - type=eval, + box_threshold: float = field( + default=0.3, + metadata={ + "help": "box threshold." + }, + ) + text_threshold: float = field( + default=0.25, + metadata={ + "help": "text threshold." + }, + ) + output_dir: str = field( + default="output", + metadata={ + "help": "output directory." + }, + ) + visual: bool = field( default=True, + metadata={ + "help": "save visual image." + }, ) - - - args = parser.parse_args() +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments)) + model_args, data_args = parser.parse_args_into_dataclasses() #bulid processor processor = GroudingDinoProcessor.from_pretrained( 'bert-base-uncased' ) #bulid model - print(f'dino_model {args.dino_type}') - dino_model = GroundingDinoModel.from_pretrained(args.dino_type) + logger.info("dino_model: {}".format(model_args.model_name_or_path)) + dino_model = GroundingDinoModel.from_pretrained(model_args.model_name_or_path) #read image - image_pil = Image.open(args.image_path).convert("RGB") + image_pil = Image.open(data_args.input_image).convert("RGB") #preprocess image text_prompt - image_tensor,mask,tokenized_out = processor(images=image_pil,text=args.text_prompt) + image_tensor,mask,tokenized_out = processor(images=image_pil,text=data_args.prompt) with paddle.no_grad(): outputs = dino_model(image_tensor,mask, input_ids=tokenized_out['input_ids'], @@ -94,14 +131,14 @@ def main(): # filter output logits_filt = logits.clone() boxes_filt = boxes.clone() - filt_mask = logits_filt.max(axis=1) > args.box_threshold + filt_mask = logits_filt.max(axis=1) > model_args.box_threshold logits_filt = logits_filt[filt_mask] # num_filt, 256 boxes_filt = boxes_filt[filt_mask] # num_filt, 4 # build pred pred_phrases = [] for logit, box in zip(logits_filt, boxes_filt): - pred_phrase = processor.decode(logit > args.text_threshold) + pred_phrase = processor.decode(logit > model_args.text_threshold) pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") @@ -111,13 +148,13 @@ def main(): "size": [size[1], size[0]], # H,W "labels": pred_phrases, } - print("output:",pred_dict) + logger.info("output{}".format(pred_dict)) - if args.visual: + if model_args.visual: # make dir - os.makedirs(args.output_dir, exist_ok=True) + os.makedirs(model_args.output_dir, exist_ok=True) image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0] - image_with_box.save(os.path.join(args.output_dir, "pred.jpg")) + 
image_with_box.save(os.path.join(model_args.output_dir, "pred.jpg")) if __name__ == "__main__": diff --git a/paddlevlp/models/groundingdino/__init__.py b/paddlevlp/models/groundingdino/__init__.py index d1ff79f33aafb8..2b7440cf1041e4 100644 --- a/paddlevlp/models/groundingdino/__init__.py +++ b/paddlevlp/models/groundingdino/__init__.py @@ -1,14 +1,8 @@ # ------------------------------------------------------------------------ # Grounding DINO -# url: https://github.com/IDEA-Research/GroundingDINO -# Copyright (c) 2023 IDEA. All Rights Reserved. +# url: https://github.com/LokeZhou/PPGroundingDINO +# Copyright (c) 2023 PaddlePaddle. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ -# Conditional DETR -# Copyright (c) 2021 Microsoft. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------ -# Copied from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -# ------------------------------------------------------------------------ + diff --git a/paddlevlp/models/groundingdino/backbone/backbone.py b/paddlevlp/models/groundingdino/backbone/backbone.py index 397a1fc36b234f..e76785f1de57e9 100644 --- a/paddlevlp/models/groundingdino/backbone/backbone.py +++ b/paddlevlp/models/groundingdino/backbone/backbone.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -70,7 +69,7 @@ def build_backbone(args): ]: pretrain_img_size = int(args.backbone.split("_")[-2]) backbone = SwinTransformerModel.from_pretrained( - args.backbone, + "Swintransformer/"+args.backbone, pretrain_img_size=pretrain_img_size, out_indices=tuple(return_interm_indices), dilation=False, diff --git a/paddlevlp/models/groundingdino/backbone/position_encoding.py b/paddlevlp/models/groundingdino/backbone/position_encoding.py index 821b0fcc161a6b..f87d671d723e85 100644 --- a/paddlevlp/models/groundingdino/backbone/position_encoding.py +++ b/paddlevlp/models/groundingdino/backbone/position_encoding.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,55 +22,6 @@ import paddle.nn as nn from paddlenlp.utils.initializer import uniform_ - - -class PositionEmbeddingSine(nn.Layer): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. 
- """ - - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, mask:paddle.Tensor): - - assert mask is not None - not_mask = ~mask - y_embed = not_mask.astype(paddle.float32).cumsum(1) - x_embed = not_mask.astype(paddle.float32).cumsum(2) - if self.normalize: - eps = 1e-6 - # if os.environ.get("SHILONG_AMP", None) == '1': - # eps = 1e-4 - # else: - # eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype(paddle.float32x) - dim_t = self.temperature ** (dim_t / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = paddle.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 - ).flatten(3) - pos_y = paddle.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 - ).flatten(3) - pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) - return pos - - class PositionEmbeddingSineHW(nn.Layer): """ This is a more standard version of the position embedding, very similar to the one @@ -122,47 +72,9 @@ def forward(self, mask:paddle.Tensor): ).flatten(3) pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) - # import ipdb; ipdb.set_trace() - return pos -class PositionEmbeddingLearned(nn.Layer): - """ - Absolute pos embedding, learned. - """ - - def __init__(self, num_pos_feats=256): - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - self.reset_parameters() - - def reset_parameters(self): - uniform_(self.row_embed.weight) - uniform_(self.col_embed.weight) - - def forward(self, x: paddle.Tensor): - - h, w = x.shape[-2:] - i = paddle.arange(w) - j = paddle.arange(h) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - pos = ( - paddle.concat( - [ - x_emb.unsqueeze(0).tile([h, 1, 1]), - y_emb.unsqueeze(1).tile([1, w, 1]), - ], - axis=-1, - ) - .transpose([2, 0, 1]) - .unsqueeze(0) - .tile([x.shape[0], 1, 1, 1]) - ) - return pos - def build_position_encoding(args): N_steps = args.hidden_dim // 2 diff --git a/paddlevlp/models/groundingdino/backbone/swin_transformer.py b/paddlevlp/models/groundingdino/backbone/swin_transformer.py index cd636f1b7965d6..191bd6c1977d0b 100644 --- a/paddlevlp/models/groundingdino/backbone/swin_transformer.py +++ b/paddlevlp/models/groundingdino/backbone/swin_transformer.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import numpy as np +from typing import Union import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -26,140 +27,12 @@ from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model """ swin_transformer model configuration""" -__all__ = ["SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION", "SwinTransformerConfig", "SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP"] - - -SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION = { - "swin_T_224_1k": { - "in_chans": 3, - "embed_dim": 96, - "depths": [2, 2, 6, 2], - "num_heads": [3, 6, 12, 24], - "window_size": 7, - "pretrain_img_size": 224, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False, - - - }, - "swin_B_224_22k": { - "in_chans": 3, - "embed_dim": 128, - "depths": [2, 2, 18, 2], - "num_heads": [4, 8, 16, 32], - "window_size": 7, - "pretrain_img_size": 224, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False - }, - "swin_B_384_22k": { - "in_chans": 3, - "embed_dim": 128, - "depths": [2, 2, 18, 2], - "num_heads": [4, 8, 16, 32], - "window_size": 12, - "pretrain_img_size": 384, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint":False - }, - "swin_L_224_22k": { - "in_chans": 3, - "embed_dim": 192, - "depths": [2, 2, 18, 2], - "num_heads": [6, 12, 24, 48], - "window_size": 7, - "pretrain_img_size": 224, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False - }, - "swin_L_384_22k":{ - "in_chans": 3, - "embed_dim": 192, - "depths": [2, 2, 18, 2], - "num_heads": [6, 12, 24, 48], - "window_size": 12, - "pretrain_img_size": 384, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False - }, - -} - -SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP = { - "model_state": { - "swin_T_224_1k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams", - "swin_B_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams", - "swin_B_384_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams", - "swin_L_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams", - "swin_L_384_22k": 
"https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams", - } -} +__all__ = ["SwinTransformerConfig"] class SwinTransformerConfig(PretrainedConfig): model_type = "swintransformer" - pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION def __init__( self, @@ -182,9 +55,12 @@ def __init__( out_indices=(0, 1, 2, 3), frozen_stages=-1, dilation=False, - use_checkpoint=False + use_checkpoint=False, + **kwargs, ): - super().__init__() + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + self.in_chans = in_chans self.embed_dim = embed_dim self.depths = depths @@ -206,6 +82,27 @@ def __init__( self.dilation = dilation self.use_checkpoint = use_checkpoint + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + class SwinTransformerPretrainedModel(PretrainedModel): """ See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. @@ -216,10 +113,6 @@ class SwinTransformerPretrainedModel(PretrainedModel): resource_files_names = {"model_state": "model_state.pdparams"} base_model_prefix = "swintransformer" - pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION - pretrained_resource_files_map = SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP - - class Mlp(nn.Layer): """Multilayer perceptron.""" diff --git a/paddlevlp/models/groundingdino/bert_model.py b/paddlevlp/models/groundingdino/bert_model.py index e0cbf877fba3a9..339b8edf11f657 100644 --- a/paddlevlp/models/groundingdino/bert_model.py +++ b/paddlevlp/models/groundingdino/bert_model.py @@ -67,9 +67,6 @@ def forward( mixed_query_layer = self.query(hidden_states) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None if is_cross_attention and past_key_value is not None: @@ -93,13 +90,6 @@ def forward( query_layer = self.transpose_for_scores(mixed_query_layer) # return query_layer,key_layer if self.is_decoder: # False - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. 
@@ -108,29 +98,20 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if self.clamp_min_for_underflow: - attention_scores = paddle.clip(attention_scores, min=-50000) # Do not increase -50000, data type half has quite limited range + attention_scores = paddle.clip(attention_scores, min=-50000) if self.clamp_max_for_overflow: - attention_scores = paddle.clip(attention_scores, max=50000) # Do not increase 50000, data type half has quite limited range + attention_scores = paddle.clip(attention_scores, max=50000) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask - # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) - # if math.isnan(attention_probs.sum().item()): - # for i in range(attention_probs.size(1)): - # for j in range(attention_probs.size(2)): - # if math.isnan(attention_probs[0, i, j].sum().item()): - # print(i, j) - # pdb.set_trace() - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) - # Mask heads if we want to + if head_mask is not None: attention_probs = attention_probs * head_mask @@ -155,10 +136,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) # diff 7.2274e-06 - hidden_states = self.dropout(hidden_states) # diff 4.22e-05 - # hidden_states + input_tensor diff : 7.22e-6 - hidden_states = self.LayerNorm(hidden_states + input_tensor) #diff 1.087e-05 + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states @@ -190,9 +170,6 @@ def forward( ) #pass # return self_outputs attention_output = self.output(self_outputs[0], hidden_states) - # print(attention_output.shape, self_outputs[0].shape, len(self_outputs)) - # attention_output 1.087e-05, self_outputs 1.31e-06 , hidden_states 1.33e-08 - # return attention_output, self_outputs, hidden_states outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -234,11 +211,9 @@ def __init__(self, config): self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer("position_ids", paddle.arange(config.max_position_embeddings).reshape((1, -1))) self.register_buffer( @@ -263,9 +238,6 @@ def forward( if position_ids is None: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without 
passing token_type_ids, solves - # issue #5664 if token_type_ids is None: if hasattr(self, "token_type_ids"): buffered_token_type_ids = self.token_type_ids[:, :seq_length] @@ -312,7 +284,7 @@ def forward( past_key_value = None, output_attentions = False, ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, @@ -338,7 +310,6 @@ def forward( " by setting `config.add_cross_attention=True`" ) - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None cross_attention_outputs = self.crossattention( attention_output, @@ -350,17 +321,15 @@ def forward( output_attentions, ) attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] - # add cross-attn cache to positions 3,4 of present_key_value tuple cross_attn_present_key_value = cross_attention_outputs[-1] present_key_value = present_key_value + cross_attn_present_key_value layer_output = self.feed_forward_chunk(attention_output) - # return layer_output, attention_output + outputs = (layer_output,) + outputs - # if decoder, return the attn key/values as the last output if self.is_decoder: outputs = outputs + (present_key_value,) @@ -452,8 +421,7 @@ def __init__(self, config): self.activation = nn.Tanh() def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. + first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) @@ -481,8 +449,6 @@ def __init__(self, config, add_pooling_layer=True): self.encoder = BertEncoder(config) self.pooler = BertPooler(config) if add_pooling_layer else None - # Initialize weights and apply final processing - # self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -505,30 +471,22 @@ def get_extended_attention_mask( dtype = np.float32 if not (attention_mask.dim() == 2 and self.config.is_decoder): - # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: warnings.warn( "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ) - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * np.finfo(dtype).min return extended_attention_mask @@ -611,12 +569,8 @@ def forward( else: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) @@ -626,11 +580,6 @@ def forward( else: encoder_extended_attention_mask = None - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( @@ -677,8 +626,6 @@ def __init__(self, cfg, bert_config): print("LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) bert_config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT - # bert_config.attention_probs_dropout_prob = 0.0 - # bert_config.hidden_dropout_prob = 0.0 self.model = BertModel(bert_config) self.language_dim = 768 diff --git a/paddlevlp/models/groundingdino/bertwarper.py b/paddlevlp/models/groundingdino/bertwarper.py index d4c75bccdbe339..09904a2cd16e4f 100644 --- a/paddlevlp/models/groundingdino/bertwarper.py +++ b/paddlevlp/models/groundingdino/bertwarper.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
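The bert_model.py changes above also strip the long explanation from get_extended_attention_mask. The idea it implements: a 0/1 padding mask is broadcast to [batch, 1, 1, seq_len] (or [batch, 1, from_seq, to_seq] for a 3D mask) and converted into an additive bias that is 0 for visible tokens and the dtype's most negative value for padded ones, so those positions vanish after the softmax. A compact standalone sketch of that conversion, assuming the same numpy dtype handling as in the diff.

import numpy as np

import paddle


def extend_attention_mask(attention_mask, dtype=np.float32):
    if attention_mask.dim() == 3:  # [batch, from_seq, to_seq]
        extended = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:  # [batch, seq_len]
        extended = attention_mask[:, None, None, :]
    else:
        raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape})")
    extended = paddle.cast(extended, dtype=dtype)
    # 1 -> 0.0 (attend), 0 -> most negative representable value (effectively removed)
    return (1.0 - extended) * np.finfo(dtype).min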
diff --git a/paddlevlp/models/groundingdino/configuration.py b/paddlevlp/models/groundingdino/configuration.py index d39c42461b99d0..257539e2c4a8a8 100644 --- a/paddlevlp/models/groundingdino/configuration.py +++ b/paddlevlp/models/groundingdino/configuration.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,69 +13,16 @@ # limitations under the License. """ GroundingDino model configuration""" - +import os +from typing import Union from paddlenlp.transformers.configuration_utils import PretrainedConfig -__all__ = ["GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION", "GroundingDinoConfig", "GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP"] - -GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION = { - "groundingdino-swint-ogc": { - "modelname" : "groundingdino", - "backbone" : "swin_T_224_1k", - "position_embedding" : "sine", - "pe_temperatureH" : 20, - "pe_temperatureW" : 20, - "return_interm_indices" : [1, 2, 3], - "backbone_freeze_keywords" : None, - "enc_layers" : 6, - "dec_layers" : 6, - "pre_norm" : False, - "dim_feedforward" : 2048, - "hidden_dim" : 256, - "dropout" : 0.0, - "nheads" : 8, - "num_queries" : 900, - "query_dim" : 4, - "num_patterns" : 0, - "num_feature_levels" : 4, - "enc_n_points" : 4, - "dec_n_points" : 4, - "two_stage_type" : "standard", - "two_stage_bbox_embed_share" : False, - "two_stage_class_embed_share" : False, - "transformer_activation" : "relu", - "dec_pred_bbox_embed_share" : True, - "dn_box_noise_scale" : 1.0, - "dn_label_noise_ratio" : 0.5, - "dn_label_coef" : 1.0, - "dn_bbox_coef" : 1.0, - "embed_init_tgt" :True, - "dn_labelbook_size" : 2000, - "max_text_len" : 256, - "text_encoder_type" : "bert-base-uncased", - "use_text_enhancer" : True, - "use_fusion_layer" : True, - "use_checkpoint" : False, - "use_transformer_ckpt" : False, - "use_text_cross_attention" : True, - "text_dropout" : 0.0, - "fusion_dropout" : 0.0, - "fusion_droppath" : 0.1, - "sub_sentence_present" : True - }, -} - -GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP = { - "model_state": { - "groundingdino-swint-ogc": "https://bj.bcebos.com/v1/paddledet/models/groundingdino_swint_ogc.pdparams", - } -} +__all__ = ["GroundingDinoConfig"] class GroundingDinoConfig(PretrainedConfig): model_type = "groundingdino" - pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION def __init__( self, @@ -121,9 +67,11 @@ def __init__( text_dropout = 0.0, fusion_dropout = 0.0, fusion_droppath = 0.1, - sub_sentence_present = True + sub_sentence_present = True, + **kwargs, ): - super().__init__() + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) self.modelname = modelname self.backbone = backbone self.position_embedding = position_embedding @@ -166,3 +114,24 @@ def __init__( self.fusion_dropout = fusion_dropout self.fusion_droppath = fusion_dropout self.sub_sentence_present = sub_sentence_present + + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type 
{config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/fuse_modules.py b/paddlevlp/models/groundingdino/fuse_modules.py index 0dc731cfa66e7d..2a3feb0b0844dd 100644 --- a/paddlevlp/models/groundingdino/fuse_modules.py +++ b/paddlevlp/models/groundingdino/fuse_modules.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlevlp/models/groundingdino/modeling.py b/paddlevlp/models/groundingdino/modeling.py index 11f5fcf76559cc..7504d9e7575056 100644 --- a/paddlevlp/models/groundingdino/modeling.py +++ b/paddlevlp/models/groundingdino/modeling.py @@ -35,11 +35,7 @@ generate_masks_with_special_tokens_and_transfer_map, ) -from .configuration import ( - GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION, - GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP, - GroundingDinoConfig, -) +from .configuration import GroundingDinoConfig from .backbone import build_backbone from .transformer import build_transformer @@ -58,10 +54,8 @@ class GroundingDinoPretrainedModel(PretrainedModel): model_config_file = "config.json" config_class = GroundingDinoConfig resource_files_names = {"model_state": "model_state.pdparams"} - base_model_prefix = "groundding" + base_model_prefix = "grounddingDino" - pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION - pretrained_resource_files_map = GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP @register_base_model class GroundingDinoModel(GroundingDinoPretrainedModel): diff --git a/paddlevlp/models/groundingdino/transformer.py b/paddlevlp/models/groundingdino/transformer.py index 697e6a90626eb9..034d836d2c5c10 100644 --- a/paddlevlp/models/groundingdino/transformer.py +++ b/paddlevlp/models/groundingdino/transformer.py @@ -60,7 +60,7 @@ def __init__( # init query learnable_tgt_init=False, # two stage - two_stage_type="no", # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1'] + two_stage_type="no", embed_init_tgt=False, # for text use_text_enhancer=False, @@ -155,7 +155,6 @@ def __init__( if num_feature_levels > 1: if self.num_encoder_layers > 0: self.level_embed = self.create_parameter(shape=[num_feature_levels, d_model]) - # self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) else: self.level_embed = None @@ -278,9 +277,6 @@ def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) ######################################################### text_dict["encoded_text"] = memory_text - # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': - # if memory.isnan().any() | memory.isinf().any(): - # import ipdb; ipdb.set_trace() if self.two_stage_type == "standard": @@ -318,8 +314,6 @@ def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, tgt_undetach = paddle.take_along_axis(arr=output_memory, axis=1,indices=topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, self.d_model])) - # gather tgt - # tgt_undetach = paddle.gather_nd(output_memory, topk_ind) if self.embed_init_tgt: tgt_ = ( self.tgt_embed.weight[:, None, 
:].tile([1, bs, 1]).transpose([1, 0, 2]) @@ -550,9 +544,6 @@ def forward( # main process for layer_id, layer in enumerate(self.layers): - # if output.isnan().any() or memory_text.isnan().any(): - # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': - # import ipdb; ipdb.set_trace() if self.fusion_layers: if self.use_checkpoint: output, memory_text = recompute( @@ -689,10 +680,7 @@ def forward( raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 pos_scale = self.query_scale(output) if self.query_scale is not None else 1 query_pos = pos_scale * raw_query_pos - # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': - # if query_pos.isnan().any() | query_pos.isinf().any(): - # import ipdb; ipdb.set_trace() - + # main process output = layer( tgt=output, @@ -719,14 +707,10 @@ def forward( print(f"num_nan {num_nan}, num_inf {num_inf}") except Exception as e: print(e) - # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': - # import ipdb; ipdb.set_trace() + # iter update if self.bbox_embed is not None: - # box_holder = self.bbox_embed(output) - # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) - # new_reference_points = box_holder[..., :self.query_dim].sigmoid() reference_before_sigmoid = inverse_sigmoid(reference_points) delta_unsig = self.bbox_embed[layer_id](output) @@ -790,8 +774,7 @@ def forward_ffn(self, src): def forward( self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None ): - # self attention - # import ipdb; ipdb.set_trace() + src2 = self.self_attn( query=self.with_pos_embed(src, pos), diff --git a/paddlevlp/models/groundingdino/utils.py b/paddlevlp/models/groundingdino/utils.py index b55987720b5e4a..4f75874a47cb3d 100644 --- a/paddlevlp/models/groundingdino/utils.py +++ b/paddlevlp/models/groundingdino/utils.py @@ -94,13 +94,10 @@ def gen_encoder_output_proposals( else: wh = paddle.ones_like(grid) * 0.05 * (2.0**lvl) - # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) - # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale - # wh = torch.ones_like(grid) / scale proposal = paddle.concat((grid, wh), -1).reshape([N_, -1, 4]) proposals.append(proposal) _cur += H_ * W_ - # import ipdb; ipdb.set_trace() + output_proposals = paddle.concat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( -1, keepdim=True @@ -113,8 +110,6 @@ def gen_encoder_output_proposals( output_memory = masked_fill(output_memory, memory_padding_mask.unsqueeze(-1), float(0)) output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) - # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) - # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) return output_memory, output_proposals @@ -204,8 +199,7 @@ def _get_activation_fn(activation, d_model=256, batch_dim=0): def gen_sineembed_for_position(pos_tensor): - # n_query, bs, _ = pos_tensor.size() - # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi dim_t = paddle.arange(128) dim_t = 10000 ** (2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) From 0592532ea6ba14dbc79d0d86f13484f665aee36e Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Tue, 4 Jul 2023 10:19:42 +0000 Subject: [PATCH 08/10] [New Feature] add visualglm --- paddlevlp/models/__init__.py | 4 +- paddlevlp/models/visualglm/__init__.py | 13 + paddlevlp/models/visualglm/configuration.py 
| 338 ++++ paddlevlp/models/visualglm/modeling.py | 1550 +++++++++++++++++ paddlevlp/processors/__init__.py | 2 + .../processors/visualglm_image_processing.py | 285 +++ paddlevlp/processors/visualglm_processing.py | 223 +++ 7 files changed, 2414 insertions(+), 1 deletion(-) create mode 100644 paddlevlp/models/visualglm/__init__.py create mode 100644 paddlevlp/models/visualglm/configuration.py create mode 100644 paddlevlp/models/visualglm/modeling.py create mode 100644 paddlevlp/processors/visualglm_image_processing.py create mode 100644 paddlevlp/processors/visualglm_processing.py diff --git a/paddlevlp/models/__init__.py b/paddlevlp/models/__init__.py index 77ef10b5801c9c..967c36e525d711 100644 --- a/paddlevlp/models/__init__.py +++ b/paddlevlp/models/__init__.py @@ -15,4 +15,6 @@ from .blip2.modeling import * from .minigpt4.configuration import * -from .minigpt4.modeling import * \ No newline at end of file +from .minigpt4.modeling import * +from .visualglm.configuration import * +from .visualglm.modeling import * \ No newline at end of file diff --git a/paddlevlp/models/visualglm/__init__.py b/paddlevlp/models/visualglm/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/models/visualglm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/models/visualglm/configuration.py b/paddlevlp/models/visualglm/configuration.py new file mode 100644 index 00000000000000..36dae15687da6b --- /dev/null +++ b/paddlevlp/models/visualglm/configuration.py @@ -0,0 +1,338 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" VisualGLM model configuration """ +import copy +import os +from typing import Union + +from ...utils.log import logger +from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["VisualGLMVisionConfig", "VisualGLMQFormerConfig", "VisualGLMConfig"] + + +class VisualGLMVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VisualGLMVisionModel`]. It is used to instantiate a + VisualGLM vision encoder according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import VisualGLMVisionConfig, VisualGLMVisionModel + >>> # Initializing a VisualGLMVisionConfig + >>> configuration = VisualGLMVisionConfig() + >>> # Initializing a VisualGLMVisionModel (with random weights) from the configuration above. 
+ >>> model = VisualGLMVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "visualglm_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + # get the vision config dict if we are loading from VisualGLMConfig + if config_dict.get("model_type") == "visualglm": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class VisualGLMQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VisualGLMQFormerModel`]. It is used to instantiate a + VisualGLM Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`VisualGLMQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + Examples: + ```python + >>> from paddlenlp.transformers import VisualGLMQFormerConfig, VisualGLMQFormerModel + >>> # Initializing a VisualGLM configuration + >>> configuration = VisualGLMQFormerConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = VisualGLMQFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "visualglm_qformer_model" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from VisualGLMConfig + if config_dict.get("model_type") == "visualglm": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != 
cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class VisualGLMConfig(PretrainedConfig): + r""" + [`VisualGLMConfig`] is the configuration class to store the configuration of a [`VisualGLMForConditionalGeneration`]. It is + used to instantiate a VisualGLM model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`VisualGLMVisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`VisualGLMQFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... VisualGLMVisionConfig, + ... VisualGLMQFormerConfig, + ... ChatGLMConfig, + ... VisualGLMConfig, + ... VisualGLMForConditionalGeneration, + ... ) + >>> # Initializing a VisualGLMConfig configuration + >>> configuration = VisualGLMConfig() + >>> # Initializing a VisualGLMForConditionalGeneration (with random weights) from the configuration above + >>> model = VisualGLMForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a VisualGLMConfig from a VisualGLMVisionConfig, VisualGLMQFormerConfig and any PretrainedConfig + >>> # Initializing VisualGLM vision, VisualGLM Q-Former and language model configurations + >>> vision_config = VisualGLMVisionConfig() + >>> qformer_config = VisualGLMQFormerConfig() + >>> text_config = ChatGLMConfig() + >>> config = VisualGLMConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "visualglm" + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the VisualGLMVisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the VisualGLMQFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`ChatGLMConfig`).") + self.vision_config = VisualGLMVisionConfig(**vision_config) + self.qformer_config = VisualGLMQFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "chatglm" + + if text_model_type == "chatglm": + self.text_config = ChatGLMConfig(**text_config) + else: + raise ValueError("Only chatglm accepted for model_type, but accepted {}.".format(text_model_type)) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: VisualGLMVisionConfig, + qformer_config: VisualGLMQFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`VisualGLMConfig`] (or a derived class) from a vision model, Q-Former and language model + configurations. + Returns: + [`VisualGLM`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/paddlevlp/models/visualglm/modeling.py b/paddlevlp/models/visualglm/modeling.py new file mode 100644 index 00000000000000..bd585984fcaafb --- /dev/null +++ b/paddlevlp/models/visualglm/modeling.py @@ -0,0 +1,1550 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
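The visualglm/configuration.py file added in this patch composes three sub-configurations and ties the Q-Former's cross-attention width to the vision encoder's hidden size. A short usage sketch follows; the paddlevlp import path is an assumption based on the new module layout (the in-file docstring examples show paddlenlp paths instead).

from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig

from paddlevlp.models.visualglm.configuration import (
    VisualGLMConfig,
    VisualGLMQFormerConfig,
    VisualGLMVisionConfig,
)

vision_config = VisualGLMVisionConfig()    # ViT-style image encoder settings
qformer_config = VisualGLMQFormerConfig()  # Q-Former bridge settings
text_config = ChatGLMConfig()              # ChatGLM language model settings

config = VisualGLMConfig.from_vision_qformer_text_configs(
    vision_config, qformer_config, text_config, num_query_tokens=32
)

# The composite config nests the three parts and aligns the Q-Former with the vision tower.
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size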
+ +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig +from paddlenlp.transformers.chatglm.modeling import ChatGLMForConditionalGeneration +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from paddlenlp.transformers.model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) + +from ...utils.log import logger +from ...activations import ACT2FN +from ...utils.initializer import normal_, ones_, zeros_ + +from .configuration import ( + VisualGLMConfig, + VisualGLMQFormerConfig, + VisualGLMVisionConfig, +) + +VisualGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +__all__ = [ + "VisualGLMModel", + "VisualGLMPretrainedModel", + "VisualGLMQFormerModel", + "VisualGLMVisionModel", + "VisualGLMForConditionalGeneration", +] + + +def Parameter(tensor, dtype="float16"): + tensor = paddle.cast(tensor, dtype) + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +@dataclass +class VisualGLMForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`VisualGLMForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class VisualGLMPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = VisualGLMConfig + base_model_prefix = "visualglm" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, VisualGLMVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, VisualGLMEncoder): + module.gradient_checkpointing = value + + +class VisualGLMVisionEmbeddings(nn.Layer): + def __init__(self, config: VisualGLMVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + self.in_channels = config.num_channels + + self.patch_embedding = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim]), dtype=self.patch_embedding.weight.dtype) + self.position_embedding = Parameter( + paddle.randn([1, self.num_positions, self.embed_dim]), dtype=self.patch_embedding.weight.dtype + ) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class VisualGLMAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + v_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias, dtype=self.qkv.weight.dtype) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +class VisualGLMMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class VisualGLMEncoderLayer(nn.Layer): + def __init__(self, config: VisualGLMConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = VisualGLMAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = VisualGLMMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class VisualGLMEncoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`VisualGLMEncoderLayer`]. + Args: + config (`VisualGLMConfig`): + The corresponding vision configuration for the `VisualGLMEncoder`. 
+ """ + + def __init__(self, config: VisualGLMConfig): + super().__init__() + self.config = config + self.layers = nn.LayerList([VisualGLMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class VisualGLMVisionModel(VisualGLMPretrainedModel): + main_input_name = "pixel_values" + config_class = VisualGLMVisionConfig + + def __init__(self, config: VisualGLMVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = VisualGLMVisionEmbeddings(config) + self.encoder = VisualGLMEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, 
epsilon=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class VisualGLMQFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + 
encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class VisualGLMQFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class VisualGLMQFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = VisualGLMQFormerMultiHeadAttention(config, is_cross_attention) + self.output = VisualGLMQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class VisualGLMQFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class VisualGLMQFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + # hidden_states = self.LayerNorm() + return hidden_states + + +class VisualGLMQFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.attention = VisualGLMQFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = VisualGLMQFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = VisualGLMQFormerIntermediate(config) + self.output_query = VisualGLMQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + hidden_states = self.input_layernorm(hidden_states) + self_attention_outputs = self.attention( + hidden_states, # 1, 32, 768 + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = 
self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class VisualGLMQFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [VisualGLMQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class VisualGLMQFormerModel(VisualGLMPretrainedModel): + """ + Querying Transformer (Q-Former), used in VisualGLM. 
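+
+ A stack of [`VisualGLMQFormerLayer`] blocks in which the input `query_embeds` self-attend and, in every
+ `cross_attention_frequency`-th layer, cross-attend to the image features passed as `encoder_hidden_states`
+ (produced by [`VisualGLMVisionModel`]). The resulting query states are later projected into the ChatGLM
+ embedding space by `VisualGLMModel.language_projection`.
+
+ Example (an illustrative sketch only; "model_name" is a placeholder checkpoint and the 257-token feature
+ length is an assumed ViT sequence length, not a value taken from a released config):
+ ```python
+ >>> import paddle
+ >>> from paddlenlp.transformers import VisualGLMModel
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> # dummy image features: [batch_size, num_patches, vision hidden size]
+ >>> image_embeds = paddle.randn([1, 257, model.config.vision_config.hidden_size])
+ >>> query_tokens = model.query_tokens.expand([1, -1, -1])
+ >>> outputs = model.qformer(query_embeds=query_tokens, encoder_hidden_states=image_embeds, return_dict=True)
+ >>> print(outputs.last_hidden_state.shape) # [1, num_query_tokens, qformer hidden size]
+ ```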
+ """ + + def __init__(self, config: VisualGLMQFormerConfig): + super().__init__(config) + self.config = config + + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = VisualGLMQFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.dropout(query_embeds) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.final_layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + 
return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class VisualGLMModel(VisualGLMPretrainedModel): + config_class = VisualGLMConfig + main_input_name = "pixel_values" + + def __init__(self, config: VisualGLMConfig): + super().__init__(config) + + self.vision_model = VisualGLMVisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), dtype=self.config.dtype + ) + self.qformer = VisualGLMQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGeneration(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import ChatGLMTokenizer, VisualGLMModel + >>> tokenizer = ChatGLMTokenizer.from_pretrained("model_name") + >>> tokenizer.pad_token = tokenizer.eos_token + >>> model = VisualGLMModel.from_pretrained("model_name") + >>> model.eval() + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples:
+ ```python
+ >>> import paddle
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel
+ >>> processor = VisualGLMProcessor.from_pretrained("model_name")
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor.process_images(images=image, return_tensors="pd")
+ >>> image_outputs = model.get_image_features(**inputs)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+ vision_outputs = self.vision_model(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ return vision_outputs
+
+ def get_qformer_features(
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs
+ ):
+ r"""
+ Returns:
+ vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`):
+ The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
+ contains the image features, the pooled image features and the hidden states if
+ `output_hidden_states=True`.
+ Examples:
+ ```python
+ >>> import paddle
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel
+ >>> processor = VisualGLMProcessor.from_pretrained("model_name")
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor.process_images(images=image, return_tensors="pd")
+ >>> qformer_outputs = model.get_qformer_features(**inputs)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # step 1: forward the images through the vision encoder,
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+ pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+ vision_outputs = self.vision_model(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ image_embeds = vision_outputs[0]
+ image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+ query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+ query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype)
+ image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype)
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=True,
+ )
+
+ return query_outputs
+
+ def forward(
+ self,
+ pixel_values: paddle.Tensor, # processed image
+ first_input_ids: paddle.Tensor,
+ second_input_ids: paddle.Tensor,
+ first_attention_mask: Optional[paddle.Tensor] = None,
+ second_attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ labels: Optional[paddle.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, VisualGLMForConditionalGenerationModelOutput]:
+ r"""
+ Returns:
+ Examples:
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> import paddle
+ >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel
+ >>> processor = VisualGLMProcessor.from_pretrained("model_name")
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> text = "describe this image"
+ >>> prompt = "###Human: ###Assistant:"
+ >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd")
+ >>> outputs = model(**inputs)
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # step 1: forward the images through the vision encoder,
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+ vision_outputs = self.vision_model(pixel_values, return_dict=True)
+ image_embeds = vision_outputs.last_hidden_state
+ image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+ query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+ query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype)
+ image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype)
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ return_dict=True,
+ )
+ query_output = query_outputs.last_hidden_state
+
+ # step 3: use the language model, conditioned on the text and image
+ language_model_inputs = self.language_projection(query_output)
+ language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64")
+
+ first_embeds = self.language_model.chatglm.transformer.word_embeddings(first_input_ids)
+ second_embeds = self.language_model.chatglm.transformer.word_embeddings(second_input_ids)
+ language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype)
+ inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1)
+
+ if first_attention_mask is None:
+ first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64")
+ if second_attention_mask is None:
+ second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64")
+ attention_mask = paddle.concat(
+ [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1
+ )
+
+ outputs = self.language_model(
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ logits = outputs.logits if return_dict else
outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return VisualGLMForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class ChatGLMForConditionalGenerationWithImage(ChatGLMForConditionalGeneration): + def __init__(self, config: ChatGLMConfig): + super(ChatGLMForConditionalGenerationWithImage, self).__init__(config) + self.config = config + + def forward( + self, + image_features: paddle.Tensor, + input_ids: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + pre_image_length: Optional[int] = None, + cache: Optional[Tuple[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None and cache is None and image_features is not None: + pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) + pre_txt_emb = self.chatglm.transformer.word_embeddings(pre_ids) + post_txt_emb = self.chatglm.transformer.word_embeddings(post_ids) + inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1) + + outputs = super().forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + cache=cache, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + return_dict=return_dict, + ) + + return outputs + + +class VisualGLMForConditionalGeneration(VisualGLMPretrainedModel): + config_class = VisualGLMConfig + main_input_name = "pixel_values" + + def __init__(self, config: VisualGLMConfig): + super().__init__(config) + self.config = config + self.vision_model = VisualGLMVisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), dtype=self.config.dtype + ) + self.qformer = VisualGLMQFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGenerationWithImage(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def encode_images( + self, + pixel_values: paddle.Tensor, # processed image + ): + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], 
dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: mapping query_output into language_model space + language_model_inputs = self.language_projection(query_output) + + return language_model_inputs + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + pre_image_length: int, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMForConditionalGeneration + >>> processor = VisualGLMProcessor.from_pretrained("model_name") + >>> model = VisualGLMForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + + image_features = self.encode_images(pixel_values) + + outputs = self.language_model.generate( + input_ids=input_ids, + image_features=image_features, + pre_image_length=pre_image_length, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py index 04006999f0b629..a481ea97ee0bb0 100644 --- a/paddlevlp/processors/__init__.py +++ b/paddlevlp/processors/__init__.py @@ -16,3 +16,5 @@ from .blip_processing import * from .minigpt4_processing import * from .minigpt4_image_processing import * +from .visualglm_processing import * +from .visualglm_image_processing import * \ No newline at end of file diff --git a/paddlevlp/processors/visualglm_image_processing.py b/paddlevlp/processors/visualglm_image_processing.py new file mode 100644 index 00000000000000..920caf1df2c128 --- /dev/null +++ b/paddlevlp/processors/visualglm_image_processing.py @@ -0,0 +1,285 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for VisualGLM."""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+
+from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from .image_transforms import (
+ convert_to_rgb,
+ normalize,
+ rescale,
+ resize,
+ to_channel_dimension_format,
+)
+from .image_utils import (
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ is_batched,
+ to_numpy_array,
+ valid_images,
+)
+
+from paddlenlp.transformers.tokenizer_utils_base import TensorType
+
+__all__ = [
+ "VisualGLMImageProcessor",
+]
+
+
+class VisualGLMImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a VisualGLM image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `resample` parameter in the `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`, the OpenAI CLIP mean):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`, the OpenAI CLIP std):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + default_image_mean = [0.48145466, 0.4578275, 0.40821073] + default_image_std = [0.26862954, 0.26130258, 0.27577711] + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else default_image_mean + self.image_std = image_std if image_std is not None else default_image_std + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` while preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlevlp/processors/visualglm_processing.py b/paddlevlp/processors/visualglm_processing.py new file mode 100644 index 00000000000000..60eb8ba0c2ed2f --- /dev/null +++ b/paddlevlp/processors/visualglm_processing.py @@ -0,0 +1,223 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for VisualGLM. 
+""" + +import re +from typing import List, Optional, Union + +import numpy as np +import paddle +from PIL import Image + +from .image_processing_utils import BatchFeature +from .image_utils import ImageInput +from .base_processing import ProcessorMixin +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding, TensorType, TextInput + +__all__ = [ + "VisualGLMProcessor", +] + + +class VisualGLMProcessor(ProcessorMixin): + r""" + Constructs a VisualGLM processor which wraps a VisualGLM image processor and an llama tokenizer into a single processor. + [`VisualGLMProcessor`] offers all the functionalities of [`VisualGLMImageProcessor`] and [`LlamaTokenizer`]. See the docstring + of [`~VisualGLMImageProcessor.__call__`] and [`~LlamaTokenizer.decode`] for more information. + + Args: + image_processor (`VisualGLMImageProcessor`): + An instance of [`VisualGLMImageProcessor`]. The image processor is a required input. + tokenizer (`LlamaTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor + + >>> # load processor + >>> minigpt4_13b_path = "model_name" + >>> processor = VisualGLMProcessor.from_pretrained(minigpt4_13b_path) + >>> print("load processor and model done!") + + >>> # prepare model inputs for VisualGLM + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + >>> res = processor([image], text, prompt) + ```""" + attributes = ["image_processor", "tokenizer"] + image_processor_class = "VisualGLMImageProcessor" + tokenizer_class = "ChatGLMTokenizer" + + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + tokenizer.model_input_names = ["input_ids", "attention_mask"] + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self.default_prompt = "" + self.image_tag = "" + self.num_query_tokens = 32 + + def process_images( + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`VisualGLMImageProcessor.__call__`] method to prepare image(s) for the model. + Please refer to the docstring of the method for more information. 
+ """ + if not images: + raise ValueError("You have to input correct images.") + + if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): + images = [images] + + processed_images = self.image_processor(images, return_tensors=return_tensors) + + return processed_images + + def process_texts( + self, + texts: Union[TextInput, List[TextInput]], + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchEncoding: + if not texts: + raise ValueError("You have to input correct texts.") + + if isinstance(texts, TextInput): + texts = [texts] + + processed_texts = self.tokenizer(text=texts, return_tensors=return_tensors, **kwargs) + return BatchEncoding(processed_texts) + + def build_inputs_with_image( + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = None, + ): + # construct prompt with inputs + if image is not None: + prompt = self.default_prompt + else: + prompt = "" + for old_query, response in history: + prompt += "问:{}\n答:{}\n".format(old_query, response) + prompt += "问:{}\n答:".format(query) + + if image is not None: + image_start_position = prompt.rfind(self.image_tag) + image_end_position = image_start_position + len(self.image_tag) + first_text_input = self.tokenizer.encode(prompt[:image_start_position], add_special_tokens=False) + image_input = [self.tokenizer.unk_token_id] * self.num_query_tokens + second_text_input = self.tokenizer.encode(prompt[image_end_position:], add_special_tokens=False) + all_input_ids = first_text_input["input_ids"] + image_input + second_text_input["input_ids"] + all_input_ids = self.tokenizer.build_inputs_with_special_tokens(all_input_ids) + + # processing image + processed_image = self.process_images(image) + + inputs = { + "input_ids": paddle.to_tensor(all_input_ids, dtype="int64").unsqueeze(0), + "pre_image_length": len(first_text_input["input_ids"]), + "pixel_values": processed_image["pixel_values"], + } + else: + inputs = self.tokenizer([prompt], return_tensors="pd") + inputs["pre_image_length"] = 0 + + return inputs + + def __call__( + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = [], + **kwargs, + ): + if image is None: + raise ValueError("Image should not be None.") + if query is None: + raise ValueError("Query should not be None.") + if not isinstance(query, str): + raise TypeError("A string type of query is expected, but acceived {}.".format(type(query))) + if not isinstance(history, list): + raise TypeError( + "A list type of history is expected with each item [query, response] in it, but acceived {}.".format( + type(history) + ) + ) + + inputs = self.build_inputs_with_image(image, query, history=history) + + return inputs + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + def process_response(self, response): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + def get_responses(self, *args, **kwargs): + processed_responses = [] + responses = self.batch_decode(*args, **kwargs) + + for response in responses: + response = self.process_response(response) + processed_responses.append(response) + + return processed_responses + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From 1bcb2707f391d77200332552448b1986d474cf0b Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Wed, 5 Jul 2023 03:19:01 +0000 Subject: [PATCH 09/10] update examples for visualglm --- paddlevlp/examples/visualglm/README.md | 29 +++++++++------------ paddlevlp/examples/visualglm/run_predict.py | 1 + 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/paddlevlp/examples/visualglm/README.md b/paddlevlp/examples/visualglm/README.md index 5c767ecacf40f5..a0c0047224cf32 100644 --- a/paddlevlp/examples/visualglm/README.md +++ b/paddlevlp/examples/visualglm/README.md @@ -12,39 +12,34 @@ VisualGLM-6B 依靠来自于 CogView 数据集的30M高质量中文图文对, ``` python run_predict.py \ - -- pretrained_name_or_path "your minigpt4 path" + -- pretrained_name_or_path "THUDM/visualglm-6b" ``` 下图这个示例展示了在使用visualglm-6b时的效果: -输入图片:
+输入图片:
输入文本:“写诗描述一下这个场景” 输出: ``` -两个杯子,黑白相间, -一个放在桌子上,另一个放在咖啡杯上。 -它们静静地坐着, -仿佛在讲述着什么故事。 一只猫和另一只猫, -彼此相依相伴, -似乎有着某种神秘的联系。 -它们的黑白对比, -仿佛是一幅美丽的画, -让人不禁沉醉其中。 这两只杯子, -是一份温馨的礼物, -代表着爱和情感的温度。 -它们在桌面上静静等待着, -期待着主人的到来, -让它们成为彼此的依靠。 +泰坦尼克号,浪漫而美丽。 +男女主角手牵手,共舞于船头。 +夕阳余晖洒落,风景如画。 +他们的身影如此优美,令人陶醉。 海水翻涌,波涛汹涌。 +船上的人们,沉浸在这美妙的时刻中。 +爱情的力量,让他们更加坚定。 +他们在大海上翱翔,享受着彼此的温暖。 电影的结束,意味着爱情的开始。 +他们将永远铭记这段美好的日子。 +在回忆里,他们会珍惜这份爱。 ``` 输入文本:“这部电影的导演是谁?” 输出: ``` -电影《猫与杯》由韩国著名导演李在均执导。 +这部电影的导演是詹姆斯·卡梅隆(James Cameron)。 ``` ## 3. License 说明 diff --git a/paddlevlp/examples/visualglm/run_predict.py b/paddlevlp/examples/visualglm/run_predict.py index f12c32c593f2a9..460dcda8f8aa24 100644 --- a/paddlevlp/examples/visualglm/run_predict.py +++ b/paddlevlp/examples/visualglm/run_predict.py @@ -31,6 +31,7 @@ def predict(args): print("load processor and model done!") url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + url = "https://paddlenlp.bj.bcebos.com/data/images/titanic.jpeg" image = Image.open(requests.get(url, stream=True).raw) generate_kwargs = { "max_length":1024, From a7014f8c609c0881a5193a54c4df5be91afcc26d Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Wed, 5 Jul 2023 03:53:57 +0000 Subject: [PATCH 10/10] fix license link --- paddlevlp/examples/visualglm/README.md | 2 +- paddlevlp/examples/visualglm/run_predict.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddlevlp/examples/visualglm/README.md b/paddlevlp/examples/visualglm/README.md index a0c0047224cf32..81c2321b304bed 100644 --- a/paddlevlp/examples/visualglm/README.md +++ b/paddlevlp/examples/visualglm/README.md @@ -43,7 +43,7 @@ python run_predict.py \ ``` ## 3. License 说明 -VisualGLM-6B模型权重使用需要遵循清华大学发布的[Model License](./MODEL_LICENSE.txt)。 +VisualGLM-6B模型权重使用需要遵循清华大学发布的[Model License](https://github.com/THUDM/VisualGLM-6B/blob/main/MODEL_LICENSE.txt)。 ## Reference diff --git a/paddlevlp/examples/visualglm/run_predict.py b/paddlevlp/examples/visualglm/run_predict.py index 460dcda8f8aa24..560ea5bc559828 100644 --- a/paddlevlp/examples/visualglm/run_predict.py +++ b/paddlevlp/examples/visualglm/run_predict.py @@ -30,7 +30,6 @@ def predict(args): processor = VisualGLMProcessor.from_pretrained(args.pretrained_name_or_path) print("load processor and model done!") - url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" url = "https://paddlenlp.bj.bcebos.com/data/images/titanic.jpeg" image = Image.open(requests.get(url, stream=True).raw) generate_kwargs = {