|
| 1 | +from typing import List, Optional, Union, Dict, Any, Tuple |
| 2 | + |
| 3 | +from PIL import Image |
| 4 | +import numpy as np |
| 5 | + |
| 6 | +from .catalog import PathManager, LABEL_MAP_CATALOG |
| 7 | +from ..base_layoutmodel import BaseLayoutModel |
| 8 | +from ...elements import Rectangle, TextBlock, Layout |
| 9 | + |
| 10 | +from ...file_utils import is_effdet_available, is_torch_cuda_available |
| 11 | + |
| 12 | +if is_effdet_available(): |
| 13 | + import torch |
| 14 | + from effdet import create_model |
| 15 | + from effdet.data.transforms import ( |
| 16 | + IMAGENET_DEFAULT_MEAN, |
| 17 | + IMAGENET_DEFAULT_STD, |
| 18 | + transforms_coco_eval, |
| 19 | + ) |
| 20 | + |
| 21 | + |
class InputTransform:
    """Preprocess PIL images into normalized input tensors for EfficientDet.

    Args:
        image_size: Target image size expected by the model; forwarded to
            the effdet COCO eval transform.
        mean: Per-channel mean (0-1 range) used for normalization.
            Defaults to ``IMAGENET_DEFAULT_MEAN``.
        std: Per-channel std (0-1 range) used for normalization.
            Defaults to ``IMAGENET_DEFAULT_STD``.
    """

    def __init__(
        self,
        image_size,
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
    ):
        self.mean = mean
        self.std = std

        # use_prefetcher=True makes the transform skip normalization and
        # return raw pixel data; normalization is done manually in
        # `preprocess` below using the pre-scaled tensors.
        self.transform = transforms_coco_eval(
            image_size,
            interpolation="bilinear",
            use_prefetcher=True,
            fill_color="mean",
            mean=self.mean,
            std=self.std,
        )

        # Scale the 0-1 stats to the 0-255 pixel range and shape them
        # (1, 3, 1, 1) so they broadcast over a (1, C, H, W) image batch.
        self.mean_tensor = torch.tensor([x * 255 for x in mean]).view(1, 3, 1, 1)
        self.std_tensor = torch.tensor([x * 255 for x in std]).view(1, 3, 1, 1)

    def preprocess(self, image: "Image.Image") -> Tuple["torch.Tensor", Dict]:
        """Convert a PIL image into a normalized (1, C, H, W) float tensor.

        Args:
            image: The input PIL image; converted to RGB first.

        Returns:
            Tuple of the normalized input tensor and a dict of per-image
            metadata produced by the transform, each value wrapped in a
            batch dimension of size 1.
        """
        image = image.convert("RGB")
        image_info = {"img_size": image.size}

        # The transform resizes/pads the image and augments image_info
        # (e.g. with the scale needed to map boxes back to original size).
        # Renamed from `input`, which shadowed the builtin.
        processed, image_info = self.transform(image, image_info)
        image_info = {
            key: torch.tensor(val).unsqueeze(0) for key, val in image_info.items()
        }

        model_input = torch.tensor(processed).unsqueeze(0)
        model_input = model_input.float().sub_(self.mean_tensor).div_(self.std_tensor)

        return model_input, image_info
| 59 | + |
| 60 | + |
class EfficientDetLayoutModel(BaseLayoutModel):
    """Create a EfficientDet-based Layout Detection Model

    Args:
        config_path (:obj:`str`):
            The path to the configuration file.
        model_path (:obj:`str`, None):
            The path to the saved weights of the model.
            If set, overwrite the weights in the configuration file.
            Defaults to `None`.
        label_map (:obj:`dict`, optional):
            The map from the model prediction (ids) to real
            word labels (strings). If the config is from one of the supported
            datasets, Layout Parser will automatically initialize the label_map.
            Defaults to `None`.
        enforce_cpu(:obj:`bool`, optional):
            When set to `True`, it will enforce using cpu even if it is on a CUDA
            available device.
        extra_config (:obj:`dict`, optional):
            Extra configuration passed to the EfficientDet model
            configuration. Currently supported arguments:
                num_classes: specifying the number of classes for the models
                output_confidence_threshold: minimum object prediction confidence to retain

    Examples::
        >>> import layoutparser as lp
        >>> model = lp.EfficientDetLayoutModel("lp://PubLayNet/tf_efficientdet_d0/config")
        >>> model.detect(image)

    """

    DEPENDENCIES = ["effdet"]
    DETECTOR_NAME = "efficientdet"

    # Detections below this confidence are dropped unless overridden via
    # extra_config["output_confidence_threshold"].
    DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD = 0.25

    # TODO: Move "lp://" to a shared constant
    _LP_PREFIX = "lp://"

    def __init__(
        self,
        config_path: str,
        model_path: Optional[str] = None,
        label_map: Optional[Dict] = None,
        extra_config: Optional[Dict] = None,
        enforce_cpu: bool = False,
        device: Optional[str] = None,
    ):
        # Fix: `enforce_cpu` was documented and accepted but previously
        # ignored; it now forces CPU inference even on CUDA machines.
        # As before, any user-supplied device is overridden to "cpu"
        # when CUDA is unavailable.
        if enforce_cpu or not is_torch_cuda_available():
            device = "cpu"
        elif device is None:
            device = "cuda"
        self.device = device

        extra_config = extra_config if extra_config is not None else {}

        self._initialize_model(config_path, model_path, label_map, extra_config)

        self.output_confidence_threshold = extra_config.get(
            "output_confidence_threshold", self.DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
        )

        self.preprocessor = InputTransform(self.config.image_size)

    def _initialize_model(
        self,
        config_path: str,
        model_path: Optional[str],
        label_map: Optional[Dict],
        extra_config: Optional[Dict],
    ):
        """Resolve the model name, weights, and class count, then build,
        place, and freeze (eval-mode) the effdet prediction model."""

        if config_path.startswith(self._LP_PREFIX):
            # If it's officially supported by layoutparser.
            # Fix: the original used `config_path.lstrip("lp://")`, which
            # strips *any* of the characters {l, p, :, /} and corrupts
            # dataset names starting with those letters (e.g. "prima");
            # slicing removes exactly the prefix.
            dataset_name, model_name = config_path[len(self._LP_PREFIX) :].split("/")[
                0:2
            ]

            if label_map is None:
                label_map = LABEL_MAP_CATALOG[dataset_name]
            num_classes = len(label_map)

            if model_path is None:
                # Download the weights when model_path is not specified
                model_path = PathManager.get_local_path(
                    self._reconstruct_path_with_detector_name(
                        config_path.replace("config", "weight")
                    )
                )
        else:
            assert (
                model_path is not None
            ), "When the specified model is not layoutparser-based, you need to specify the model_path"

            assert (
                label_map is not None or "num_classes" in extra_config
            ), "When the specified model is not layoutparser-based, you need to specify the label_map or add num_classes in the extra_config"

            model_name = config_path
            model_path = PathManager.get_local_path(
                model_path
            )  # It might be an https URL

            num_classes = len(label_map) if label_map else extra_config["num_classes"]

        # Single construction path for both branches (previously duplicated).
        self.model = create_model(
            model_name,
            num_classes=num_classes,
            bench_task="predict",
            pretrained=True,
            checkpoint_path=model_path,
        )

        self.model.to(self.device)
        self.model.eval()
        self.config = self.model.config
        self.label_map = label_map if label_map is not None else {}

    def _reconstruct_path_with_detector_name(self, path: str) -> str:
        """This function will add the detector name (efficientdet) into the
        lp model config path to get the "canonical" model name.

        Args:
            path (str): The given input path that might or might not contain the detector name.

        Returns:
            str: a modified path that contains the detector name.
        """
        if path.startswith(self._LP_PREFIX):
            model_name = path[len(self._LP_PREFIX) :]
            model_name_segments = model_name.split("/")
            # Only 3-segment paths ("dataset/model/target") lack the
            # detector name; 4-segment paths already include it.
            if (
                len(model_name_segments) == 3
                and self.DETECTOR_NAME not in model_name_segments
            ):
                return self._LP_PREFIX + self.DETECTOR_NAME + "/" + model_name
        return path

    def detect(self, image: Union["np.ndarray", "Image.Image"]):
        """Run layout detection on a single image.

        Args:
            image: A PIL image, or a cv2-style BGR ``np.ndarray``.

        Returns:
            Layout: The detected text blocks.
        """
        image = self.image_loader(image)

        model_inputs, image_info = self.preprocessor.preprocess(image)

        model_outputs = self.model(
            model_inputs.to(self.device),
            {key: val.to(self.device) for key, val in image_info.items()},
        )

        return self.gather_output(model_outputs)

    def gather_output(self, model_outputs: "torch.Tensor") -> Layout:
        """Convert raw effdet detections into a :class:`Layout`.

        ``model_outputs`` is treated as a (batch, num_dets, 6) tensor with
        rows of ``[x1, y1, x2, y2, score, class_id]``, scores in descending
        order within each sample.
        """
        model_outputs = model_outputs.cpu().detach()
        box_predictions = Layout()

        for index, sample in enumerate(model_outputs):
            # Rewrite x2/y2 as width/height in place.
            sample[:, 2] -= sample[:, 0]
            sample[:, 3] -= sample[:, 1]

            for det in sample:

                score = float(det[4])
                pred_cat = int(det[5])
                x, y, w, h = det[0:4].tolist()

                if (
                    score < self.output_confidence_threshold
                ):  # stop when below this threshold, scores in descending order
                    break

                box_predictions.append(
                    TextBlock(
                        # Convert back to (x1, y1, x2, y2) corners.
                        block=Rectangle(x, y, w + x, h + y),
                        score=score,
                        # NOTE(review): `index` is the batch-sample index, so
                        # every block from one image shares an id — confirm
                        # this is intended rather than a per-detection id.
                        id=index,
                        type=self.label_map.get(pred_cat, pred_cat),
                    )
                )

        return box_predictions

    def image_loader(self, image: Union["np.ndarray", "Image.Image"]) -> "Image.Image":
        """Normalize supported input types to an RGB PIL image."""

        # Convert cv2 Image Input
        if isinstance(image, np.ndarray):
            # In this case, we assume the image is loaded by cv2
            # and the channel order is BGR
            image = image[..., ::-1]
            image = Image.fromarray(image, mode="RGB")

        return image
0 commit comments