
Commit 6013a29

supriyar authored and facebook-github-bot committed on Sep 9, 2020
[quant] Support quantization of embedding lookup operators (pytorch#44207)
Summary: Pull Request resolved: pytorch#44207

Use existing embedding_bag operator but set offsets to [0, 1, .. len(indices)]

Test Plan: python test/test_quantization.py TestEmbeddingOps.test_embedding_byte

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D23547385

fbshipit-source-id: ccce348bc192c6a4a65a8eca4c8b90f99f40f1b1
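The whole change rides on one observation: a plain embedding lookup is an embedding_bag in which every bag contains exactly one index, so passing offsets = [0, 1, ..., len(indices) - 1] reuses the existing bag kernel unchanged. A minimal sketch of that equivalence at the float level, using the public functional API for illustration (the commit itself works on the quantized kernels):

import torch
import torch.nn.functional as F

weight = torch.randn(10, 4)
indices = torch.tensor([3, 1, 7, 7, 0])
# One single-element bag per index.
offsets = torch.arange(indices.numel())

bagged = F.embedding_bag(indices, weight, offsets, mode='sum')
plain = F.embedding(indices, weight)
assert torch.allclose(bagged, plain)  # summing a one-row bag yields the row itself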
1 parent f27be2f commit 6013a29

File tree

4 files changed: +66 −8 lines
 

aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp

+31 −4

@@ -17,8 +17,9 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_byte(
     bool sparse,
     const c10::optional<at::Tensor>& per_sample_weights_,
     bool include_last_offset) {
-  TORCH_CHECK(offsets_in.has_value(), "embedding_bag_byte_rowwise_offsets expects offsets to be set");
+  TORCH_CHECK(
+      offsets_in.has_value(),
+      "embedding_bag_byte_rowwise_offsets expects offsets to be set");
   auto offsets = offsets_in.value();
   auto offsets_data = offsets.data_ptr<int64_t>();
   const auto indices_data = indices.data_ptr<int64_t>();
@@ -123,7 +124,9 @@ Tensor embedding_bag_byte_rowwise_offsets(
     bool include_last_offset) {
   TORCH_CHECK(weight.scalar_type() == at::kByte);
   TORCH_CHECK(weight.ndimension() == 2);
-  TORCH_CHECK(offsets_in.has_value(), "embedding_bag_byte_rowwise_offsets expects offsets to be set");
+  TORCH_CHECK(
+      offsets_in.has_value(),
+      "embedding_bag_byte_rowwise_offsets expects offsets to be set");

   auto offsets = offsets_in.value();
   auto offsets_data = offsets.data_ptr<int64_t>();
@@ -221,7 +224,9 @@ Tensor embedding_bag_4bit_rowwise_offsets(
     const c10::optional<Tensor>& per_sample_weights_,
     const c10::optional<Tensor>& compressed_indices_mapping,
     bool include_last_offset) {
-  TORCH_CHECK(offsets_in.has_value(), "embedding_bag_4bit_rowwise_offsets expects offsets to be set");
+  TORCH_CHECK(
+      offsets_in.has_value(),
+      "embedding_bag_4bit_rowwise_offsets expects offsets to be set");

   TORCH_CHECK(weight.ndimension() == 2);
   TORCH_CHECK(indices.ndimension() == 1);
@@ -423,9 +428,31 @@ class QEmbeddingBag final {
   }
 };

+template <int bit_rate>
+class QEmbedding final {
+ public:
+  static at::Tensor run(
+      const c10::intrusive_ptr<EmbeddingPackedParamsBase>& packed_weight,
+      const Tensor& indices,
+      bool sparse) {
+    const auto offsets_size = indices.numel();
+    at::Tensor offsets = at::arange(0, offsets_size, at::kLong);
+    at::Tensor output;
+    if (bit_rate == 8) {
+      return packed_weight->embeddingbag_byte(
+          indices, offsets, sparse, c10::nullopt, false);
+    } else {
+      TORCH_INTERNAL_ASSERT(
+          "Currently only support 8-bit embedding quantization");
+    }
+    return output;
+  }
+};
+
 TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   // Function that works on TorchBind packed weights.
   m.impl("embedding_bag_byte", TORCH_FN(QEmbeddingBag<8>::run));
+  m.impl("embedding_byte", TORCH_FN(QEmbedding<8>::run));

   // Functions that work on at::Tensor packed weight.
   m.impl(
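In Python terms, the new QEmbedding<8>::run above boils down to the delegation below. This is an illustrative sketch only (embedding_byte_sketch is a hypothetical name); the real implementation calls the TorchBind packed weight's embeddingbag_byte method directly rather than going back through the public op:

# Rough Python rendering of QEmbedding<8>::run (illustrative sketch).
def embedding_byte_sketch(packed_weight, indices, sparse=False):
    # Build one single-element bag per index: offsets = [0, 1, ..., N-1].
    offsets = torch.arange(0, indices.numel(), dtype=torch.long)
    # Delegate to the existing byte embedding_bag op; no per_sample_weights,
    # include_last_offset=False.
    return torch.ops.quantized.embedding_bag_byte(
        packed_weight, indices, offsets, sparse=sparse)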

aten/src/ATen/native/quantized/library.cpp

+1
@@ -110,6 +110,7 @@ TORCH_LIBRARY(quantized, m) {
   m.def("embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor");
   m.def("embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor");
   m.def("embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor");
+  m.def("embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool sparse=False) -> Tensor");
   m.def("celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor");
   m.def("hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor");
   m.def("group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor");

test/quantization/test_quantized_op.py

+33 −3

@@ -22,6 +22,7 @@
 from torch.testing._internal.common_quantization import skipIfNoFBGEMM
 from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \
     override_quantized_engine, supported_qengines, override_qengines
+from torch.quantization import PerChannelMinMaxObserver

 np_dtype = {
     torch.quint8 : np.uint8,
@@ -2716,7 +2717,7 @@ def test_qlinear_unpack(self, W, use_channelwise):


 @unittest.skipIf(sys.platform == "darwin", "Known test failure on Mac.")
-class TestQuantizedEmbeddingBag(TestCase):
+class TestQuantizedEmbeddingOps(TestCase):
     def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate):
         weights = torch.from_numpy((np.random.random_sample((
             num_embeddings, embedding_dim)) + 1).astype(np.float32))
@@ -2727,7 +2728,6 @@ def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embe
         if bit_rate == 8:
             # Check numerics of prepack function that accepts qtensor as input.
             # We use min-max observer to mimic the quantization performed in the original function.
-            from torch.quantization import PerChannelMinMaxObserver
             obs = PerChannelMinMaxObserver(dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0)
             obs(weights)
             # Get the scale and zero point for the weight tensor
@@ -2884,7 +2884,6 @@ def get_reference_result(

         if bit_rate == 8:
             # Test operator that accepts TorchBind packed weights.
-            from torch.quantization import PerChannelMinMaxObserver
             obs = PerChannelMinMaxObserver(dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0)
             obs(weights)
             # Get the scale and zero point for the weight tensor
@@ -2931,6 +2930,37 @@ def test_embedding_bag_4bit_rowwise_offsets(self, num_embeddings,
                                             include_last_offset, atol=0.1,
                                             rtol=1e-2)

+    """ Tests the correctness of the quantized embedding lookup operator """
+    @given(num_embeddings=st.integers(10, 100),
+           embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0))
+    def test_embedding_byte(self, num_embeddings, embedding_dim):
+        quant_op = torch.ops.quantized.embedding_byte
+        prepack_op = torch.ops.quantized.embedding_bag_prepack
+
+        weights = torch.from_numpy((np.random.random_sample((
+            num_embeddings, embedding_dim)) + 1).astype(np.float32))
+
+        obs = PerChannelMinMaxObserver(dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0)
+        obs(weights)
+        # Get the scale and zero point for the weight tensor
+        qparams = obs.calculate_qparams()
+
+        # Quantize the weights to 8bits
+        qweight = torch.quantize_per_channel(weights, qparams[0], qparams[1], axis=0, dtype=torch.quint8)
+        max_segments = 5
+        max_segment_length = 20
+        num_lengths = np.random.randint(1, max_segments + 1)
+        lengths = np.random.randint(1, max_segment_length + 1,
+                                    size=num_lengths).astype(np.int32)
+        num_indices = np.sum(lengths)
+        indices = torch.from_numpy(np.random.randint(
+            low=0, high=num_embeddings, size=num_indices, dtype=np.int64))
+
+        packed_weight = prepack_op(qweight)
+        qresult = quant_op(packed_weight, indices, sparse=False)
+
+        ref = torch.embedding(weights, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False)
+        torch.testing.assert_allclose(ref, qresult, atol=0.005, rtol=1e-3)

 class TestQuantizedConv(unittest.TestCase):
     def _test_qconv_unpack_impl(self, qconv_prepack_fn, qconv_unpack_fn, inputs,
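A note on the test's tolerances: the weights are drawn from [1, 2), so each row's range is below 1 and the per-channel 8-bit scale stays under 1/255; the round-trip quantization error per element is then roughly scale/2, comfortably inside atol=0.005. A standalone sketch of that bound, using the same observer settings as the test (the (10, 16) shape is an arbitrary illustration):

import numpy as np
import torch
from torch.quantization import PerChannelMinMaxObserver

weights = torch.from_numpy(
    (np.random.random_sample((10, 16)) + 1).astype(np.float32))
obs = PerChannelMinMaxObserver(dtype=torch.quint8,
                               qscheme=torch.per_channel_affine_float_qparams,
                               ch_axis=0)
obs(weights)
scales, zero_points = obs.calculate_qparams()
qweight = torch.quantize_per_channel(
    weights, scales, zero_points, axis=0, dtype=torch.quint8)
# Max round-trip error should sit well under the test's atol of 0.005.
print((qweight.dequantize() - weights).abs().max())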

test/test_quantization.py

+1 −1

@@ -13,7 +13,7 @@
 from quantization.test_quantized_op import TestDynamicQuantizedLinear  # noqa: F401
 from quantization.test_quantized_op import TestComparatorOps  # noqa: F401
 from quantization.test_quantized_op import TestPadding  # noqa: F401
-from quantization.test_quantized_op import TestQuantizedEmbeddingBag  # noqa: F401
+from quantization.test_quantized_op import TestQuantizedEmbeddingOps  # noqa: F401

 # Quantized Functional
 from quantization.test_quantized_functional import TestQuantizedFunctional  # noqa: F401
