|
1 |
| -//! Software emulation support for CLMUL hardware intrinsics. |
2 |
| -//! |
3 |
| -//! WARNING: Not constant time! Should be made constant-time or disabled by default. |
4 |
| -
|
5 |
| -// TODO(tarcieri): performance-oriented constant-time implementation |
6 |
| -// See: <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c> |
| 1 | +//! Constant-time software implementation of POLYVAL |
| 2 | +
|
| 3 | +// Adapted from BearSSL's `ghash_ctmul64.c` |
| 4 | +// <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c;hb=4b6046412bf927d6424f20fc7ee495bb96dbd227> |
| 5 | +// |
| 6 | +// Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> |
| 7 | +// |
| 8 | +// Permission is hereby granted, free of charge, to any person obtaining |
| 9 | +// a copy of this software and associated documentation files (the |
| 10 | +// "Software"), to deal in the Software without restriction, including |
| 11 | +// without limitation the rights to use, copy, modify, merge, publish, |
| 12 | +// distribute, sublicense, and/or sell copies of the Software, and to |
| 13 | +// permit persons to whom the Software is furnished to do so, subject to |
| 14 | +// the following conditions: |
| 15 | +// |
| 16 | +// The above copyright notice and this permission notice shall be |
| 17 | +// included in all copies or substantial portions of the Software. |
| 18 | +// |
| 19 | +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 20 | +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| 21 | +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 22 | +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
| 23 | +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| 24 | +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| 25 | +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 26 | +// SOFTWARE. |
| 27 | + |
| 28 | +#![allow(missing_docs)] |
7 | 29 |
|
8 | 30 | use super::Backend;
|
9 | 31 | use crate::field::Block;
|
10 |
| -use core::{convert::TryInto, ops::Add}; |
| 32 | +use core::{ |
| 33 | + convert::TryInto, |
| 34 | + ops::{Add, Mul}, |
| 35 | +}; |
11 | 36 |
|
12 |
/// 2 x `u64` values: a 128-bit POLYVAL field element stored as two
/// 64-bit halves.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct U64x2(u64, u64);

// NOTE(review): no methods are provided in this impl, so `Backend` appears
// to be a marker trait (or one whose methods are all defaulted) — confirm
// against its definition in `super`.
impl Backend for U64x2 {}
| 42 | + |
16 | 43 | impl From<Block> for U64x2 {
|
17 | 44 | fn from(bytes: Block) -> U64x2 {
|
18 | 45 | U64x2(
|
@@ -43,53 +70,104 @@ impl From<U64x2> for u128 {
|
43 | 70 | }
|
44 | 71 | }
|
45 | 72 |
|
| 73 | +#[allow(clippy::suspicious_arithmetic_impl)] |
46 | 74 | impl Add for U64x2 {
|
47 | 75 | type Output = Self;
|
48 | 76 |
|
49 | 77 | /// Adds two POLYVAL field elements.
|
50 |
| - fn add(self, rhs: Self) -> Self { |
| 78 | + fn add(self, rhs: Self) -> Self::Output { |
51 | 79 | U64x2(self.0 ^ rhs.0, self.1 ^ rhs.1)
|
52 | 80 | }
|
53 | 81 | }
|
54 | 82 |
|
55 |
| -impl Backend for U64x2 { |
56 |
| - fn clmul(self, other: Self, imm: u8) -> Self { |
57 |
| - let (a, b) = match imm.into() { |
58 |
| - 0x00 => (self.0, other.0), |
59 |
| - 0x01 => (self.1, other.0), |
60 |
| - 0x10 => (self.0, other.1), |
61 |
| - 0x11 => (self.1, other.1), |
62 |
| - _ => unreachable!(), |
63 |
| - }; |
// Field multiplication is carry-less (XORs and bit-matrix tricks), hence
// the clippy allowance. Adapted from BearSSL's `ghash_ctmul64.c` (see the
// header of this file).
#[allow(clippy::suspicious_arithmetic_impl)]
impl Mul for U64x2 {
    type Output = Self;

    /// Computes POLYVAL multiplication over GF(2^128).
    // TODO(tarcieri): actually adapt the arithmetic below from GHASH
    fn mul(self, rhs: Self) -> Self {
        // Karatsuba decomposition: three 64x64 carry-less multiplies via
        // `bmul64` (on h0/h1, y0/y1 and their XOR-sums) instead of four.
        let h0 = self.0;
        let h1 = self.1;
        let h0r = rev64(h0);
        let h1r = rev64(h1);
        let h2 = h0 ^ h1;
        let h2r = h0r ^ h1r;

        let y0 = rhs.0;
        let y1 = rhs.1;
        let y0r = rev64(y0);
        let y1r = rev64(y1);
        let y2 = y0 ^ y1;
        let y2r = y0r ^ y1r;
        // `bmul64` yields only the low 64 bits of each 128-bit carry-less
        // product; the high halves are recovered below by multiplying the
        // bit-reversed operands and reversing the result (BearSSL's trick).
        let z0 = bmul64(y0, h0);
        let z1 = bmul64(y1, h1);

        let mut z2 = bmul64(y2, h2);
        let mut z0h = bmul64(y0r, h0r);
        let mut z1h = bmul64(y1r, h1r);
        let mut z2h = bmul64(y2r, h2r);

        // Karatsuba middle term: z2 = (y0^y1)(h0^h1) ^ z0 ^ z1.
        z2 ^= z0 ^ z1;
        z2h ^= z0h ^ z1h;
        // rev(rev(a) * rev(b)) >> 1 gives the high half of the carry-less
        // product a * b.
        z0h = rev64(z0h) >> 1;
        z1h = rev64(z1h) >> 1;
        z2h = rev64(z2h) >> 1;

        // Assemble the 256-bit product as v3:v2:v1:v0 (low to high).
        let mut v0 = z0;
        let mut v1 = z0h ^ z2;
        let mut v2 = z1 ^ z2h;
        let mut v3 = z1h;

        // Shift the whole 256-bit product left by one bit.
        // NOTE(review): this extra shift matches BearSSL's reflected-bit-order
        // GHASH code; per the TODO above, confirm it is what POLYVAL wants.
        v3 = v3 << 1 | v2 >> 63;
        v2 = v2 << 1 | v1 >> 63;
        v1 = v1 << 1 | v0 >> 63;
        v0 <<= 1;

        // Modular reduction: fold v0 into v1/v2, then v1 into v2/v3. The
        // shift amounts (1, 2, 7 and 63, 62, 57) match BearSSL's GHASH
        // reduction for the field polynomial.
        v2 ^= v0 ^ v0 >> 1 ^ v0 >> 2 ^ v0 >> 7;
        v1 ^= v0 << 63 ^ v0 << 62 ^ v0 << 57;
        v3 ^= v1 ^ v1 >> 1 ^ v1 >> 2 ^ v1 >> 7;
        v2 ^= v1 << 63 ^ v1 << 62 ^ v1 << 57;

        // After folding, the reduced 128-bit result lives in (v2, v3).
        U64x2(v2, v3)
    }
}
66 | 135 |
|
67 |
| - for i in 0..64 { |
68 |
| - if b & (1 << i) != 0 { |
69 |
| - result.1 ^= a; |
70 |
| - } |
/// Reverses the bits of a `u64` using a branch-free swap network:
/// each step exchanges ever-larger groups of bits (1, 2, 4, 8, 16),
/// and the final 32-bit rotation swaps the two halves.
fn rev64(mut x: u64) -> u64 {
    const STEPS: [(u32, u64); 5] = [
        (1, 0x5555_5555_5555_5555),
        (2, 0x3333_3333_3333_3333),
        (4, 0x0f0f_0f0f_0f0f_0f0f),
        (8, 0x00ff_00ff_00ff_00ff),
        (16, 0x0000_ffff_0000_ffff),
    ];

    for &(shift, mask) in STEPS.iter() {
        x = ((x & mask) << shift) | ((x >> shift) & mask);
    }

    // Equivalent to (x << 32) | (x >> 32): swap the two 32-bit halves.
    x.rotate_left(32)
}
71 | 144 |
|
72 |
| - result.0 >>= 1; |
/// Constant-time 64x64 -> 64 carry-less ("binary polynomial") multiply,
/// returning the low 64 bits of the product.
///
/// Each operand is split into four lanes keeping only every 4th bit, so the
/// integer multiplications (`wrapping_mul`) never carry between the active
/// bit positions of a lane — no data-dependent branches or timing.
fn bmul64(x: u64, y: u64) -> u64 {
    // Lane masks: bits congruent to 0, 1, 2, 3 (mod 4) respectively.
    const MASKS: [u64; 4] = [
        0x1111_1111_1111_1111,
        0x2222_2222_2222_2222,
        0x4444_4444_4444_4444,
        0x8888_8888_8888_8888,
    ];

    let xs = [x & MASKS[0], x & MASKS[1], x & MASKS[2], x & MASKS[3]];
    let ys = [y & MASKS[0], y & MASKS[1], y & MASKS[2], y & MASKS[3]];

    let mut out = 0;

    for i in 0..4 {
        // Lane i accumulates the partial products whose bit positions are
        // congruent to i (mod 4): XOR over j of xs[j] * ys[(i - j) mod 4].
        let mut lane = 0u64;

        for j in 0..4 {
            lane ^= xs[j].wrapping_mul(ys[(i + 4 - j) % 4]);
        }

        // Mask away the garbage that accumulated between the lane's bits.
        out |= lane & MASKS[i];
    }

    out
}
|
0 commit comments