BVLC · shelhamer · Jun 11, 2014 · Jun 9, 2014 · Jun 10, 2014 · Jun 9, 2014
diff --git a/docs/getting_pretrained_models.md b/docs/getting_pretrained_models.md
@@ -24,4 +24,6 @@ This page will be updated as more models become available.
 - The best validation performance during training was iteration 358,000 with
   validation accuracy 57.258% and loss 1.83948.
 
+**R-CNN (ILSVRC13)**: The pure Caffe instantiation of the [R-CNN](https://github.com/rbgirshick/rcnn) model for ILSVRC13 detection. Download the model (230.8MB) by running `examples/imagenet/get_caffe_rcnn_imagenet_model.sh` from the Caffe root directory. This model was made by transplanting the R-CNN SVM classifiers into a `fc-rcnn` classification layer, provided here as an off-the-shelf Caffe detector. Try the [detection example](http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/detection.ipynb) to see it in action. For the full details, refer to the R-CNN site. *N.B. For research purposes, make use of the official R-CNN package and not this example.*
+
 Additionally, you will probably eventually need some auxiliary data (mean image, synset list, etc.): run `data/ilsvrc12/get_ilsvrc_aux.sh` from the root directory to obtain it.
diff --git a/examples/detection.ipynb b/examples/detection.ipynb
diff --git a/examples/imagenet/get_caffe_rcnn_imagenet_model.sh b/examples/imagenet/get_caffe_rcnn_imagenet_model.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env sh
+# This script downloads the Caffe R-CNN ImageNet model
+# for ILSVRC13 detection.
+
+MODEL=caffe_rcnn_imagenet_model
+CHECKSUM=42c1556d2d47a9128c4a90e0a9c5341c
+
+if [ -f "$MODEL" ]; then
+  echo "Model already exists. Checking md5..."
+  os=`uname -s`
+  if [ "$os" = "Linux" ]; then
+    checksum=`md5sum "$MODEL" | awk '{ print $1 }'`
+  elif [ "$os" = "Darwin" ]; then
+    checksum=`cat "$MODEL" | md5`
+  fi
+  if [ "$checksum" = "$CHECKSUM" ]; then
+    echo "Model checksum is correct. No need to download."
+    exit 0
+  else
+    echo "Model checksum is incorrect. Need to download again."
+  fi
+fi
+
+echo "Downloading..."
+# -O overwrites a stale copy; by default wget would keep it and save to "$MODEL.1".
+wget --no-check-certificate -O "$MODEL" https://www.dropbox.com/s/0i3etlgmsmgf5ei/$MODEL || exit 1
+echo "Done. Please run this command again to verify that checksum = $CHECKSUM."
diff --git a/examples/imagenet/rcnn_imagenet_deploy.prototxt b/examples/imagenet/rcnn_imagenet_deploy.prototxt
@@ -0,0 +1,207 @@
+name: "R-CNN-ilsvrc13"
+input: "data"
+input_dim: 10
+input_dim: 3
+input_dim: 227
+input_dim: 227
+layers {
+  name: "conv1"
+  type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+  }
+}
+layers {
+  name: "relu1"
+  type: RELU
+  bottom: "conv1"
+  top: "conv1"
+}
+layers {
+  name: "pool1"
+  type: POOLING
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layers {
+  name: "norm1"
+  type: LRN
+  bottom: "pool1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layers {
+  name: "conv2"
+  type: CONVOLUTION
+  bottom: "norm1"
+  top: "conv2"
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+  }
+}
+layers {
+  name: "relu2"
+  type: RELU
+  bottom: "conv2"
+  top: "conv2"
+}
+layers {
+  name: "pool2"
+  type: POOLING
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layers {
+  name: "norm2"
+  type: LRN
+  bottom: "pool2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layers {
+  name: "conv3"
+  type: CONVOLUTION
+  bottom: "norm2"
+  top: "conv3"
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+  }
+}
+layers {
+  name: "relu3"
+  type: RELU
+  bottom: "conv3"
+  top: "conv3"
+}
+layers {
+  name: "conv4"
+  type: CONVOLUTION
+  bottom: "conv3"
+  top: "conv4"
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+  }
+}
+layers {
+  name: "relu4"
+  type: RELU
+  bottom: "conv4"
+  top: "conv4"
+}
+layers {
+  name: "conv5"
+  type: CONVOLUTION
+  bottom: "conv4"
+  top: "conv5"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+  }
+}
+layers {
+  name: "relu5"
+  type: RELU
+  bottom: "conv5"
+  top: "conv5"
+}
+layers {
+  name: "pool5"
+  type: POOLING
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layers {
+  name: "fc6"
+  type: INNER_PRODUCT
+  bottom: "pool5"
+  top: "fc6"
+  inner_product_param {
+    num_output: 4096
+  }
+}
+layers {
+  name: "relu6"
+  type: RELU
+  bottom: "fc6"
+  top: "fc6"
+}
+layers {
+  name: "drop6"
+  type: DROPOUT
+  bottom: "fc6"
+  top: "fc6"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layers {
+  name: "fc7"
+  type: INNER_PRODUCT
+  bottom: "fc6"
+  top: "fc7"
+  inner_product_param {
+    num_output: 4096
+  }
+}
+layers {
+  name: "relu7"
+  type: RELU
+  bottom: "fc7"
+  top: "fc7"
+}
+layers {
+  name: "drop7"
+  type: DROPOUT
+  bottom: "fc7"
+  top: "fc7"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+# R-CNN classification layer made from R-CNN ILSVRC13 SVMs.
+layers {
+  name: "fc-rcnn"
+  type: INNER_PRODUCT
+  bottom: "fc7"
+  top: "fc-rcnn"
+  inner_product_param {
+    num_output: 200
+  }
+}
diff --git a/examples/images/fish-bike.jpg b/examples/images/fish-bike.jpg
diff --git a/python/caffe/detector.py b/python/caffe/detector.py
@@ -12,10 +12,6 @@
 The selective_search_ijcv_with_python code required for the selective search
 proposal mode is available at
     https://github.com/sergeyk/selective_search_ijcv_with_python
-
-TODO
-- R-CNN crop mode / crop with context.
-- Bundle with R-CNN model for example.
 """
 import numpy as np
 import os
@@ -29,11 +25,14 @@ class Detector(caffe.Net):
     selective search proposals.
     """
     def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
-                 input_scale=None, channel_swap=None):
+                 input_scale=None, channel_swap=None, context_pad=None):
         """
         Take
         gpu, mean_file, input_scale, channel_swap: convenience params for
             setting mode, mean, input scale, and channel order.
+        context_pad: amount of surrounding context to take s.t. a `context_pad`
+            sized border of pixels in the network input image is context, as in
+            R-CNN feature extraction.
         """
         caffe.Net.__init__(self, model_file, pretrained_file)
         self.set_phase_test()
@@ -50,6 +49,8 @@ def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
         if channel_swap:
             self.set_channel_swap(self.inputs[0], channel_swap)
 
+        self.configure_crop(context_pad)
+
 
     def detect_windows(self, images_windows):
         """
@@ -58,6 +59,7 @@ def detect_windows(self, images_windows):
 
         Take
         images_windows: (image filename, window list) iterable.
+        Context is cropped per the `context_pad` given to the constructor.
 
         Give
         detections: list of {filename: image filename, window: crop coordinates,
@@ -68,8 +70,7 @@ def detect_windows(self, images_windows):
         for image_fname, windows in images_windows:
             image = caffe.io.load_image(image_fname).astype(np.float32)
             for window in windows:
-                window_inputs.append(image[window[0]:window[2],
-                                           window[1]:window[3]])
+                window_inputs.append(self.crop(image, window))
 
         # Run through the net (warping windows to input dimensions).
         caffe_in = np.asarray([self.preprocess(self.inputs[0], window_in)
@@ -106,6 +107,85 @@ def detect_selective_search(self, image_fnames):
         import selective_search_ijcv_with_python as selective_search
         # Make absolute paths so MATLAB can find the files.
         image_fnames = [os.path.abspath(f) for f in image_fnames]
-        windows_list = selective_search.get_windows(image_fnames)
+        windows_list = selective_search.get_windows(
+            image_fnames,
+            cmd='selective_search_rcnn'
+        )
         # Run windowed detection on the selective search list.
         return self.detect_windows(zip(image_fnames, windows_list))
+
+
+    def crop(self, im, window):
+        """
+        Crop a window from the image for detection. Include surrounding context
+        according to the `context_pad` configuration.
+
+        Take
+        im: H x W x K image ndarray to crop.
+        window: bounding box coordinates as ymin, xmin, ymax, xmax.
+
+        Give
+        crop: cropped window.
+        """
+        # Crop window from the image.
+        crop = im[window[0]:window[2], window[1]:window[3]]
+
+        if self.context_pad:
+            box = window.copy()
+            crop_size = self.blobs[self.inputs[0]].width  # assumes square
+            scale = crop_size / (1. * crop_size - self.context_pad * 2)
+            # Crop a box + surrounding context.
+            half_h = (box[2] - box[0] + 1) / 2.
+            half_w = (box[3] - box[1] + 1) / 2.
+            center = (box[0] + half_h, box[1] + half_w)
+            scaled_dims = scale * np.array((-half_h, -half_w, half_h, half_w))
+            box = np.round(np.tile(center, 2) + scaled_dims)
+            full_h = box[2] - box[0] + 1
+            full_w = box[3] - box[1] + 1
+            scale_h = crop_size / full_h
+            scale_w = crop_size / full_w
+            pad_y = round(max(0, -box[0]) * scale_h)  # amount out-of-bounds
+            pad_x = round(max(0, -box[1]) * scale_w)
+
+            # Clip box to image dimensions.
+            im_h, im_w = im.shape[:2]
+            box = np.clip(box, 0., [im_h, im_w, im_h, im_w])
+            clip_h = box[2] - box[0] + 1
+            clip_w = box[3] - box[1] + 1
+            assert(clip_h > 0 and clip_w > 0)
+            crop_h = round(clip_h * scale_h)
+            crop_w = round(clip_w * scale_w)
+            if pad_y + crop_h > crop_size:
+                crop_h = crop_size - pad_y
+            if pad_x + crop_w > crop_size:
+                crop_w = crop_size - pad_x
+
+            # collect with context padding and place in input
+            # with mean padding
+            context_crop = im[box[0]:box[2], box[1]:box[3]]
+            context_crop = caffe.io.resize_image(context_crop, (crop_h, crop_w))
+            crop = self.crop_mean.copy()
+            crop[pad_y:(pad_y + crop_h), pad_x:(pad_x + crop_w)] = context_crop
+
+        return crop
+
+
+    def configure_crop(self, context_pad):
+        """
+        Configure amount of context for cropping.
+        If context is included, make the special input mean for context padding.
+
+        Take
+        context_pad: amount of context for cropping.
+        """
+        self.context_pad = context_pad
+        if self.context_pad:
+            input_scale = self.input_scale.get(self.inputs[0])
+            channel_order = self.channel_swap.get(self.inputs[0])
+            # Padding context crops needs the mean in unprocessed input space.
+            self.crop_mean = self.mean[self.inputs[0]].copy()
+            self.crop_mean = self.crop_mean.transpose((1,2,0))
+            channel_order_inverse = [channel_order.index(i)
+                                     for i in range(self.crop_mean.shape[2])]
+            self.crop_mean = self.crop_mean[:,:, channel_order_inverse]
+            self.crop_mean /= input_scale