Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

polyval: Constant-time software implementation #7

Merged
merged 1 commit into from
Sep 3, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 13 additions & 17 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ rust:
- nightly

script:
- cargo test --all --exclude polyval --release
- cargo test --all --release
- cargo test --all --all-features --release

env:
@@ -23,36 +23,32 @@ matrix:
rust: 1.34.0
env: {} # clear `-D warnings` above; allow warnings

# polyval presently needs either RUSTFLAGS or non-default features
- name: "Rust: 1.32.0 (polyval)"
# Test `polyval` with the PCLMULQDQ-accelerated backend
- name: "Rust: 1.34.0 (polyval)"
rust: 1.34.0
script: ./test_polyval.sh
env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
script: cd polyval && cargo test --release --tests
- name: "Rust: stable (polyval)"
rust: stable
script: ./test_polyval.sh
env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
script: cd polyval && cargo test --release --tests

# no_std build
- name: "Rust: stable (thumbv7em-none-eabihf)"
rust: stable
install:
- rustup target add thumbv7em-none-eabihf
script:
- cargo build --all --exclude polyval --target thumbv7em-none-eabihf --release
install: rustup target add thumbv7em-none-eabihf
script: cargo build --all --target thumbv7em-none-eabihf --release
- name: "Rust: nightly (benches)"
rust: nightly
script: cargo build --all-features --benches
- name: rustfmt
rust: stable
install:
- rustup component add rustfmt
script:
- cargo fmt --all -- --check
install: rustup component add rustfmt
script: cargo fmt --all -- --check
- name: clippy
rust: stable
install:
- rustup component add clippy
script:
- cargo clippy --all
install: rustup component add clippy
script: cargo clippy --all

branches:
only:
1 change: 0 additions & 1 deletion poly1305/Cargo.toml
Original file line number Diff line number Diff line change
@@ -16,7 +16,6 @@ universal-hash = { version = "0.2", default-features = false }
zeroize = { version = "0.10", optional = true, default-features = false }

[features]
default = []
std = ["universal-hash/std"]

[badges]
2 changes: 0 additions & 2 deletions polyval/Cargo.toml
Original file line number Diff line number Diff line change
@@ -22,9 +22,7 @@ zeroize = { version = "0.10", optional = true, default-features = false }
hex-literal = "0.1"

[features]
default = []
std = ["universal-hash/std"]
insecure-soft = []

[badges]
maintenance = { status = "experimental" }
9 changes: 1 addition & 8 deletions polyval/src/field.rs
Original file line number Diff line number Diff line change
@@ -77,14 +77,7 @@ impl<B: Backend> Mul for Element<B> {
///
/// [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3
fn mul(self, rhs: Self) -> Self {
let t1 = self.0.clmul(rhs.0, 0x00);
let t2 = self.0.clmul(rhs.0, 0x01);
let t3 = self.0.clmul(rhs.0, 0x10);
let t4 = self.0.clmul(rhs.0, 0x11);
let t5 = t2 + t3;
let t6 = t4 + t5.shr64();
let t7 = (t1 + t5.shl64()).reduce();
Element(t6 + t7)
Element(self.0 * rhs.0)
}
}

65 changes: 12 additions & 53 deletions polyval/src/field/backend.rs
Original file line number Diff line number Diff line change
@@ -7,27 +7,12 @@
any(target_arch = "x86", target_arch = "x86_64")
))]
mod pclmulqdq;

#[cfg(feature = "insecure-soft")]
pub mod soft;
mod soft;

use super::Block;
use core::ops::Add;
use core::ops::{Add, Mul};

#[cfg(not(any(
all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
),
feature = "insecure-soft"
)))]
compile_error!(
"no backends available! On x86/x86-64 platforms, enable intrinsics with \
RUSTFLAGS=\"-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1\" or \
enable **INSECURE** portable emulation with the `insecure-soft` feature"
);
// TODO(tarcieri): runtime selection of PCLMULQDQ based on CPU features

#[cfg(all(
target_feature = "pclmulqdq",
@@ -37,42 +22,16 @@ compile_error!(
))]
pub(crate) use self::pclmulqdq::M128i;

#[cfg(all(
not(all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
)),
feature = "insecure-soft"
))]
#[cfg(not(all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
)))]
pub(crate) use self::soft::U64x2 as M128i;

/// Field arithmetic backend
pub trait Backend: Add<Output = Self> + Copy + From<Block> + Into<Block> + From<u128> {
/// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
/// Algorithm 4: "Montgomery reduction"
///
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
fn reduce(self) -> Self {
// Mask value used when performing Montgomery fast reduction.
// This corresponds to POLYVAL's polynomial with the highest bit unset.
let mask = Self::from(1 << 127 | 1 << 126 | 1 << 121 | 1);
let a = mask.clmul(self, 0x01);
let b = self.shuffle() + a;
let c = mask.clmul(b, 0x01);
b.shuffle() + c
}

/// Carryless multiplication
fn clmul(self, rhs: Self, imm: u8) -> Self;

/// Swap the hi and low 64-bit halves of the register
fn shuffle(self) -> Self;

/// Shift the contents of the register left by 64-bits
fn shl64(self) -> Self;

/// Shift the contents of the register right by 64-bits
fn shr64(self) -> Self;
pub trait Backend:
Copy + Add<Output = Self> + Mul<Output = Self> + From<Block> + Into<Block>
{
}
65 changes: 41 additions & 24 deletions polyval/src/field/backend/pclmulqdq.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
//! Support for the PCLMULQDQ CPU intrinsic on `x86` and `x86_64` target
//! architectures.
// The code below uses `loadu`/`storeu` to support unaligned loads/stores
#![allow(clippy::cast_ptr_alignment)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use super::Backend;
use crate::field::Block;
use core::ops::Add;
use core::ops::{Add, Mul};

/// Wrapper for `__m128i` - a 128-bit XMM register (SSE2)
#[repr(align(16))]
#[derive(Copy, Clone)]
pub struct M128i(__m128i);

impl Backend for M128i {}

impl From<Block> for M128i {
// `_mm_loadu_si128` performs an unaligned load
#[allow(clippy::cast_ptr_alignment)]
fn from(bytes: Block) -> M128i {
M128i(unsafe { _mm_loadu_si128(bytes.as_ptr() as *const __m128i) })
}
}

impl From<M128i> for Block {
// `_mm_storeu_si128` performs an unaligned store
#[allow(clippy::cast_ptr_alignment)]
fn from(xmm: M128i) -> Block {
let mut result = Block::default();

@@ -36,12 +39,6 @@ impl From<M128i> for Block {
}
}

impl From<u128> for M128i {
fn from(x: u128) -> M128i {
M128i(unsafe { _mm_loadu_si128(&x as *const u128 as *const __m128i) })
}
}

impl Add for M128i {
type Output = Self;

@@ -51,23 +48,41 @@ impl Add for M128i {
}
}

impl Backend for M128i {
/// Wrapper for PCLMULQDQ
fn clmul(self, rhs: Self, imm: u8) -> Self {
M128i(unsafe { pclmulqdq(self.0, rhs.0, imm) })
}

fn shuffle(self) -> Self {
M128i(unsafe { shufpd1(self.0) })
}
impl Mul for M128i {
type Output = Self;

fn shl64(self) -> Self {
M128i(unsafe { pslldq8(self.0) })
/// Computes carryless POLYVAL multiplication over GF(2^128).
fn mul(self, rhs: Self) -> Self {
unsafe {
let t1 = pclmulqdq(self.0, rhs.0, 0x00);
let t2 = pclmulqdq(self.0, rhs.0, 0x01);
let t3 = pclmulqdq(self.0, rhs.0, 0x10);
let t4 = pclmulqdq(self.0, rhs.0, 0x11);
let t5 = xor(t2, t3);
let t6 = xor(t4, psrldq8(t5));
let t7 = xor(t1, pslldq8(t5));
M128i(xor(t6, reduce(t7)))
}
}
}

fn shr64(self) -> Self {
M128i(unsafe { psrldq8(self.0) })
}
/// Mask value used when performing Montgomery fast reduction.
/// This corresponds to POLYVAL's polynomial with the highest bit unset.
const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;

/// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
/// Algorithm 4: "Montgomery reduction"
///
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
unsafe fn reduce(x: __m128i) -> __m128i {
// `_mm_loadu_si128` performs an unaligned load
// (`u128` is not necessarily aligned to 16-bytes)
#[allow(clippy::cast_ptr_alignment)]
let mask = _mm_loadu_si128(&MASK as *const u128 as *const __m128i);
let a = pclmulqdq(mask, x, 0x01);
let b = xor(shufpd1(x), a);
let c = pclmulqdq(mask, b, 0x01);
xor(shufpd1(b), c)
}

#[target_feature(enable = "sse2", enable = "sse4.1")]
@@ -94,6 +109,8 @@ unsafe fn psrldq8(a: __m128i) -> __m128i {
// TODO(tarcieri): _mm256_clmulepi64_epi128 (vpclmulqdq)
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn pclmulqdq(a: __m128i, b: __m128i, imm: u8) -> __m128i {
// The `imm` value passed to `_mm_clmulepi64_si128` needs to be a literal
// value since it ends up being encoded into the CPU instruction.
match imm {
// Low-Low: `clmul(a[0..8], b[0..8])` (PCLMULLQLQDQ)
0x00 => _mm_clmulepi64_si128(a, b, 0x00),
166 changes: 114 additions & 52 deletions polyval/src/field/backend/soft.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
//! Software emulation support for CLMUL hardware intrinsics.
//! Constant-time software implementation of POLYVAL
//!
//! WARNING: Not constant time! Should be made constant-time or disabled by default.
// TODO(tarcieri): performance-oriented constant-time implementation
// See: <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c>
//! Adapted from BearSSL's `ghash_ctmul64.c`
//! <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c;hb=4b6046412bf927d6424f20fc7ee495bb96dbd227>
//!
//! Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
use super::Backend;
use crate::field::Block;
use core::{convert::TryInto, ops::Add};
use core::{
convert::TryInto,
ops::{Add, Mul},
};

/// 2 x `u64` values emulating an XMM register
/// 2 x `u64` values
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct U64x2(u64, u64);

impl Backend for U64x2 {}

impl From<Block> for U64x2 {
fn from(bytes: Block) -> U64x2 {
U64x2(
@@ -29,67 +34,124 @@ impl From<U64x2> for Block {
}
}

impl From<u128> for U64x2 {
fn from(x: u128) -> U64x2 {
let lo = (x & 0xFFFF_FFFFF) as u64;
let hi = (x >> 64) as u64;
U64x2(lo, hi)
}
}

impl From<U64x2> for u128 {
fn from(u64x2: U64x2) -> u128 {
u128::from(u64x2.0) | (u128::from(u64x2.1) << 64)
}
}

#[allow(clippy::suspicious_arithmetic_impl)]
impl Add for U64x2 {
type Output = Self;

/// Adds two POLYVAL field elements.
fn add(self, rhs: Self) -> Self {
fn add(self, rhs: Self) -> Self::Output {
U64x2(self.0 ^ rhs.0, self.1 ^ rhs.1)
}
}

impl Backend for U64x2 {
fn clmul(self, other: Self, imm: u8) -> Self {
let (a, b) = match imm.into() {
0x00 => (self.0, other.0),
0x01 => (self.1, other.0),
0x10 => (self.0, other.1),
0x11 => (self.1, other.1),
_ => unreachable!(),
};

let mut result = U64x2(0, 0);

for i in 0..64 {
if b & (1 << i) != 0 {
result.1 ^= a;
}

result.0 >>= 1;

if result.1 & 1 != 0 {
result.0 ^= 1 << 63;
}

result.1 >>= 1;
}

result
}
#[allow(clippy::suspicious_arithmetic_impl)]
impl Mul for U64x2 {
type Output = Self;

fn shuffle(self) -> Self {
U64x2(self.1, self.0)
/// Computes carryless POLYVAL multiplication over GF(2^128) in constant time.
///
/// Method described at:
/// <https://www.bearssl.org/constanttime.html#ghash-for-gcm>
///
/// POLYVAL multiplication is effectively the little endian equivalent of
/// GHASH multiplication, aside from one small detail described here:
///
/// <https://crypto.stackexchange.com/questions/66448/how-does-bearssls-gcm-modular-reduction-work/66462#66462>
///
/// > The product of two bit-reversed 128-bit polynomials yields the
/// > bit-reversed result over 255 bits, not 256. The BearSSL code ends up
/// > with a 256-bit result in zw[], and that value is shifted by one bit,
/// > because of that reversed convention issue. Thus, the code must
/// > include a shifting step to put it back where it should
///
/// This shift is unnecessary for POLYVAL and has been removed.
fn mul(self, rhs: Self) -> Self {
let h0 = self.0;
let h1 = self.1;
let h0r = rev64(h0);
let h1r = rev64(h1);
let h2 = h0 ^ h1;
let h2r = h0r ^ h1r;

let y0 = rhs.0;
let y1 = rhs.1;
let y0r = rev64(y0);
let y1r = rev64(y1);
let y2 = y0 ^ y1;
let y2r = y0r ^ y1r;
let z0 = bmul64(y0, h0);
let z1 = bmul64(y1, h1);

let mut z2 = bmul64(y2, h2);
let mut z0h = bmul64(y0r, h0r);
let mut z1h = bmul64(y1r, h1r);
let mut z2h = bmul64(y2r, h2r);

z2 ^= z0 ^ z1;
z2h ^= z0h ^ z1h;
z0h = rev64(z0h) >> 1;
z1h = rev64(z1h) >> 1;
z2h = rev64(z2h) >> 1;

let v0 = z0;
let mut v1 = z0h ^ z2;
let mut v2 = z1 ^ z2h;
let mut v3 = z1h;

v2 ^= v0 ^ v0 >> 1 ^ v0 >> 2 ^ v0 >> 7;
v1 ^= v0 << 63 ^ v0 << 62 ^ v0 << 57;
v3 ^= v1 ^ v1 >> 1 ^ v1 >> 2 ^ v1 >> 7;
v2 ^= v1 << 63 ^ v1 << 62 ^ v1 << 57;

U64x2(v2, v3)
}
}

fn shl64(self) -> Self {
U64x2(0, self.0)
}
/// Reverse a `u64` in constant time
fn rev64(mut x: u64) -> u64 {
x = ((x & 0x5555_5555_5555_5555) << 1) | ((x >> 1) & 0x5555_5555_5555_5555);
x = ((x & 0x3333_3333_3333_3333) << 2) | ((x >> 2) & 0x3333_3333_3333_3333);
x = ((x & 0x0f0f_0f0f_0f0f_0f0f) << 4) | ((x >> 4) & 0x0f0f_0f0f_0f0f_0f0f);
x = ((x & 0x00ff_00ff_00ff_00ff) << 8) | ((x >> 8) & 0x00ff_00ff_00ff_00ff);
x = ((x & 0xffff_0000_ffff) << 16) | ((x >> 16) & 0xffff_0000_ffff);
(x << 32) | (x >> 32)
}

fn shr64(self) -> Self {
U64x2(self.1, 0)
}
/// Carryless integer multiplication with “holes” (sequences of zeroes) to
/// avoid carry spilling. When carries do occur, they wind up in a "hole" and
/// are subsequently masked out of the result.
fn bmul64(x: u64, y: u64) -> u64 {
let x0 = x & 0x1111_1111_1111_1111;
let x1 = x & 0x2222_2222_2222_2222;
let x2 = x & 0x4444_4444_4444_4444;
let x3 = x & 0x8888_8888_8888_8888;
let y0 = y & 0x1111_1111_1111_1111;
let y1 = y & 0x2222_2222_2222_2222;
let y2 = y & 0x4444_4444_4444_4444;
let y3 = y & 0x8888_8888_8888_8888;

let mut z0 =
x0.wrapping_mul(y0) ^ x1.wrapping_mul(y3) ^ x2.wrapping_mul(y2) ^ x3.wrapping_mul(y1);

let mut z1 =
x0.wrapping_mul(y1) ^ x1.wrapping_mul(y0) ^ x2.wrapping_mul(y3) ^ x3.wrapping_mul(y2);

let mut z2 =
x0.wrapping_mul(y2) ^ x1.wrapping_mul(y1) ^ x2.wrapping_mul(y0) ^ x3.wrapping_mul(y3);

let mut z3 =
x0.wrapping_mul(y3) ^ x1.wrapping_mul(y2) ^ x2.wrapping_mul(y1) ^ x3.wrapping_mul(y0);

z0 &= 0x1111_1111_1111_1111;
z1 &= 0x2222_2222_2222_2222;
z2 &= 0x4444_4444_4444_4444;
z3 &= 0x8888_8888_8888_8888;

z0 | z1 | z2 | z3
}
5 changes: 2 additions & 3 deletions polyval/src/lib.rs
Original file line number Diff line number Diff line change
@@ -18,9 +18,8 @@
//! - x86(-64) CPU: `target-cpu=sandybridge` or newer
//! - SSE2 + SSE4.1: `target-feature=+sse2,+sse4.1`
//!
//! An **INSECURE** (variable timing) portable implementation is gated behind
//! the `insecure-soft` cargo feature. Use of this implementation is
//! **NOT RECOMMENDED** and may potentially leak the POLYVAL key!
//! If `RUSTFLAGS` are not provided, this crate will fall back to a much slower
//! software-only implementation.
//!
//! ## Relationship to GHASH
//!
12 changes: 0 additions & 12 deletions test_polyval.sh

This file was deleted.