RustCrypto · Sep 1, 2019
diff --git a/‎.travis.yml
+6-4 b/‎.travis.yml
+6-4
diff --git a/‎poly1305/benches/poly1305.rs
+1-1 b/‎poly1305/benches/poly1305.rs
+1-1
diff --git a/‎polyval/Cargo.toml
+2-2 b/‎polyval/Cargo.toml
+2-2
diff --git a/‎polyval/benches/polyval.rs
+1-1 b/‎polyval/benches/polyval.rs
+1-1
diff --git a/‎polyval/src/field.rs
+3-26 b/‎polyval/src/field.rs
+3-26
diff --git a/‎polyval/src/field/backend.rs
+8-17 b/‎polyval/src/field/backend.rs
+8-17
diff --git a/‎polyval/src/field/backend/pclmulqdq.rs
+54-20 b/‎polyval/src/field/backend/pclmulqdq.rs
+54-20
diff --git a/‎polyval/src/field/backend/soft.rs
+125-53 b/‎polyval/src/field/backend/soft.rs
+125-53
diff --git a/‎polyval/src/field/clmul.rs
-55 b/‎polyval/src/field/clmul.rs
-55
diff --git a/‎polyval/src/lib.rs
+3-4 b/‎polyval/src/lib.rs
+3-4
diff --git a/‎test_polyval.sh
-12 b/‎test_polyval.sh
-12
@@ -6,7 +6,7 @@ rust:
   - nightly
 
 script:
-  - cargo test --all --exclude polyval --release
+  - cargo test --all --release
   - cargo test --all --all-features --release
 
 env:
@@ -23,13 +23,15 @@ matrix:
       rust: 1.34.0
       env: {} # clear `-D warnings` above; allow warnings
 
-    # polyval presently needs either RUSTFLAGS or non-default features
+    # Test `polyval` with the PCLMULQDQ-accelerated backend
     - name: "Rust: 1.32.0 (polyval)"
       rust: 1.34.0
-      script: ./test_polyval.sh
+      env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
+      script: cd polyval && cargo --no-default-features --release
     - name: "Rust: stable (polyval)"
       rust: stable
-      script: ./test_polyval.sh
+      env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
+      script: cd polyval && cargo test --no-default-features --release
 
     # no_std build
     - name: "Rust: stable (thumbv7em-none-eabihf)"
 
@@ -2,7 +2,7 @@
 
 extern crate test;
 
-use poly1305::{Poly1305, universal_hash::UniversalHash};
+use poly1305::{universal_hash::UniversalHash, Poly1305};
 use test::Bencher;
 
 // TODO(tarcieri): move this into the `universal-hash` crate
 
@@ -22,9 +22,9 @@ zeroize = { version = "0.10", optional = true, default-features = false }
 hex-literal = "0.1"
 
 [features]
-default = []
+default = ["soft"]
 std = ["universal-hash/std"]
-insecure-soft = []
+soft = []
 
 [badges]
 maintenance = { status = "experimental" }
 
@@ -2,7 +2,7 @@
 
 extern crate test;
 
-use polyval::{Polyval, universal_hash::UniversalHash};
+use polyval::{universal_hash::UniversalHash, Polyval};
 use test::Bencher;
 
 // TODO(tarcieri): move this into the `universal-hash` crate
 
@@ -14,8 +14,7 @@
 //!
 //! [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3
 
-pub(crate) mod backend;
-mod clmul;
+pub mod backend;
 
 use self::backend::Backend;
 use core::ops::{Add, Mul};
@@ -26,12 +25,6 @@ pub const FIELD_SIZE: usize = 16;
 /// POLYVAL field element bytestrings (16-bytes)
 pub type Block = [u8; FIELD_SIZE];
 
-/// Mask value used when performing Montgomery fast reduction.
-/// This corresponds to POLYVAL's polynomial with the highest bit unset.
-///
-/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
-const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
-
 /// POLYVAL field element.
 #[derive(Copy, Clone)]
 pub struct Element<B: Backend>(B);
@@ -46,17 +39,6 @@ impl<B: Backend> Element<B> {
     pub fn to_bytes(self) -> Block {
         self.0.into()
     }
-
-    /// Fast reduction modulo x^128 + x^127 + x^126 +x^121 + 1 (Gueron 2012)
-    /// Algorithm 4: "Montgomery reduction"
-    fn reduce(self) -> Self {
-        let mask = B::from(MASK);
-        let a = mask.clmul(self.0, 0x01);
-        let b = self.0.shuffle() ^ a;
-        let c = mask.clmul(b, 0x01);
-        let d = b.shuffle() ^ c;
-        Element(d)
-    }
 }
 
 impl<B: Backend> Default for Element<B> {
@@ -77,7 +59,7 @@ impl<B: Backend> Add for Element<B> {
     ///
     /// [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3
     fn add(self, rhs: Self) -> Self {
-        Element(self.0 ^ rhs.0)
+        Element(self.0 + rhs.0)
     }
 }
 
@@ -95,12 +77,7 @@ impl<B: Backend> Mul for Element<B> {
     ///
     /// [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3
     fn mul(self, rhs: Self) -> Self {
-        let t1 = self.0.clmul(rhs.0, 0x00);
-        let t2 = self.0.clmul(rhs.0, 0x01);
-        let t3 = self.0.clmul(rhs.0, 0x10);
-        let t4 = self.0.clmul(rhs.0, 0x11);
-        let t5 = t2 ^ t3;
-        Element(t4 ^ t5.shr64()) + Element(t1 ^ t5.shl64()).reduce()
+        Element(self.0 * rhs.0)
     }
 }
 
 
@@ -8,12 +8,11 @@
 ))]
 mod pclmulqdq;
 
-#[cfg(feature = "insecure-soft")]
-mod soft;
+#[cfg(feature = "soft")]
+pub mod soft;
 
-use super::clmul::Clmul;
 use super::Block;
-use core::ops::BitXor;
+use core::ops::{Add, Mul};
 
 #[cfg(not(any(
     all(
@@ -22,12 +21,12 @@ use core::ops::BitXor;
         target_feature = "sse4.1",
         any(target_arch = "x86", target_arch = "x86_64")
     ),
-    feature = "insecure-soft"
+    feature = "soft"
 )))]
 compile_error!(
     "no backends available! On x86/x86-64 platforms, enable intrinsics with \
      RUSTFLAGS=\"-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1\" or \
-     enable **INSECURE** portable emulation with the `insecure-soft` feature"
+     enable portable emulation with the `soft` Cargo feature"
 );
 
 #[cfg(all(
@@ -45,20 +44,12 @@ pub(crate) use self::pclmulqdq::M128i;
         target_feature = "sse4.1",
         any(target_arch = "x86", target_arch = "x86_64")
     )),
-    feature = "insecure-soft"
+    feature = "soft"
 ))]
 pub(crate) use self::soft::U64x2 as M128i;
 
-/// Trait representing the arithmetic operations we expect on the XMM registers
+/// Field arithmetic backend
 pub trait Backend:
-    BitXor<Output = Self> + Clmul + Copy + From<Block> + Into<Block> + From<u128>
+    Add<Output = Self> + Mul<Output = Self> + Copy + From<Block> + Into<Block> + From<u128>
 {
-    /// Swap the hi and low 64-bit halves of the register
-    fn shuffle(self) -> Self;
-
-    /// Shift the contents of the register left by 64-bits
-    fn shl64(self) -> Self;
-
-    /// Shift the contents of the register right by 64-bits
-    fn shr64(self) -> Self;
 }
@@ -10,17 +10,16 @@ use core::arch::x86::*;
 use core::arch::x86_64::*;
 
 use super::Backend;
-use crate::field::{
-    clmul::{self, Clmul},
-    Block,
-};
-use core::ops::BitXor;
+use crate::field::Block;
+use core::ops::{Add, Mul};
 
 /// Wrapper for `__m128i` - a 128-bit XMM register (SSE2)
 #[repr(align(16))]
 #[derive(Copy, Clone)]
 pub struct M128i(__m128i);
 
+impl Backend for M128i {}
+
 impl From<Block> for M128i {
     fn from(bytes: Block) -> M128i {
         M128i(unsafe { _mm_loadu_si128(bytes.as_ptr() as *const __m128i) })
@@ -45,24 +44,50 @@ impl From<u128> for M128i {
     }
 }
 
-impl BitXor for M128i {
+impl Add for M128i {
     type Output = Self;
 
-    fn bitxor(self, rhs: Self) -> Self::Output {
+    /// Adds two POLYVAL field elements.
+    fn add(self, rhs: Self) -> Self {
         M128i(unsafe { xor(self.0, rhs.0) })
     }
 }
 
-impl Clmul for M128i {
-    fn clmul<I>(self, rhs: Self, imm: I) -> Self
-    where
-        I: Into<clmul::PseudoOp>,
-    {
-        M128i(unsafe { pclmulqdq(self.0, rhs.0, imm.into()) })
+#[allow(clippy::suspicious_arithmetic_impl)]
+impl Mul for M128i {
+    type Output = Self;
+
+    /// Computes POLYVAL multiplication over GF(2^128).
+    fn mul(self, rhs: Self) -> Self {
+        let t1 = self.clmul(rhs, 0x00);
+        let t2 = self.clmul(rhs, 0x01);
+        let t3 = self.clmul(rhs, 0x10);
+        let t4 = self.clmul(rhs, 0x11);
+        let t5 = t2 + t3;
+        (t4 + t5.shr64()) + (t1 + t5.shl64()).reduce()
     }
 }
 
-impl Backend for M128i {
+impl M128i {
+    /// Wrapper for PCLMULQDQ
+    fn clmul(self, rhs: Self, imm: u8) -> Self {
+        M128i(unsafe { pclmulqdq(self.0, rhs.0, imm) })
+    }
+
+    /// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
+    /// Algorithm 4: "Montgomery reduction"
+    ///
+    /// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
+    fn reduce(self) -> Self {
+        // Mask value used when performing Montgomery fast reduction.
+        // This corresponds to POLYVAL's polynomial with the highest bit unset.
+        let mask = Self::from(1 << 127 | 1 << 126 | 1 << 121 | 1);
+        let a = mask.clmul(self, 0x01);
+        let b = self.shuffle() + a;
+        let c = mask.clmul(b, 0x01);
+        b.shuffle() + c
+    }
+
     fn shuffle(self) -> Self {
         M128i(unsafe { shufpd1(self.0) })
     }
@@ -99,11 +124,20 @@ unsafe fn psrldq8(a: __m128i) -> __m128i {
 
 // TODO(tarcieri): _mm256_clmulepi64_epi128 (vpclmulqdq)
 #[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
-unsafe fn pclmulqdq(a: __m128i, b: __m128i, op: clmul::PseudoOp) -> __m128i {
-    match op {
-        clmul::PseudoOp::PCLMULLQLQDQ => _mm_clmulepi64_si128(a, b, 0x00),
-        clmul::PseudoOp::PCLMULHQLQDQ => _mm_clmulepi64_si128(a, b, 0x01),
-        clmul::PseudoOp::PCLMULLQHQDQ => _mm_clmulepi64_si128(a, b, 0x10),
-        clmul::PseudoOp::PCLMULHQHQDQ => _mm_clmulepi64_si128(a, b, 0x11),
+unsafe fn pclmulqdq(a: __m128i, b: __m128i, imm: u8) -> __m128i {
+    match imm {
+        // Low-Low: `clmul(a[0..8], b[0..8])` (PCLMULLQLQDQ)
+        0x00 => _mm_clmulepi64_si128(a, b, 0x00),
+
+        // High-Low: `clmul(a[8..16], b[0..8])` (PCLMULHQLQDQ)
+        0x01 => _mm_clmulepi64_si128(a, b, 0x01),
+
+        // Low-High: `clmul(a[0..8], b[8..16])` (PCLMULLQHQDQ)
+        0x10 => _mm_clmulepi64_si128(a, b, 0x10),
+
+        // High-High: `clmul(a[8..16], b[8..16])` (PCLMULHQHQDQ)
+        0x11 => _mm_clmulepi64_si128(a, b, 0x11),
+
+        _ => unreachable!(),
     }
 }
@@ -1,27 +1,51 @@
-//! Software emulation support for CLMUL hardware intrinsics.
-//!
-//! WARNING: Not constant time! Should be made constant-time or disabled by default.
-
-// TODO(tarcieri): performance-oriented constant-time implementation
-// See: <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c>
+//! Constant-time software implementation of POLYVAL
+
+// Adapted from BearSSL's `ghash_ctmul64.c`
+// <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c;hb=4b6046412bf927d6424f20fc7ee495bb96dbd227>
+//
+// Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#![allow(missing_docs)]
 
 use super::Backend;
-use crate::field::{
-    clmul::{self, Clmul},
-    Block,
+use crate::field::Block;
+use core::{
+    convert::TryInto,
+    ops::{Add, Mul},
 };
-use core::{convert::TryInto, ops::BitXor};
 
 /// 2 x `u64` values emulating an XMM register
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
-pub struct U64x2([u64; 2]);
+pub struct U64x2(u64, u64);
+
+impl Backend for U64x2 {}
 
 impl From<Block> for U64x2 {
     fn from(bytes: Block) -> U64x2 {
-        U64x2([
+        U64x2(
             u64::from_le_bytes(bytes[..8].try_into().unwrap()),
             u64::from_le_bytes(bytes[8..].try_into().unwrap()),
-        ])
+        )
     }
 }
 
@@ -36,66 +60,114 @@ impl From<u128> for U64x2 {
     fn from(x: u128) -> U64x2 {
-        let lo = (x & 0xFFFF_FFFFF) as u64;
+        let lo = (x & 0xFFFF_FFFF_FFFF_FFFF) as u64; // low 64 bits
         let hi = (x >> 64) as u64;
-        U64x2([lo, hi])
+        U64x2(lo, hi)
     }
 }
 
 impl From<U64x2> for u128 {
     fn from(u64x2: U64x2) -> u128 {
-        u128::from(u64x2.0[0]) | (u128::from(u64x2.0[1]) << 64)
+        u128::from(u64x2.0) | (u128::from(u64x2.1) << 64)
     }
 }
 
-impl BitXor for U64x2 {
+#[allow(clippy::suspicious_arithmetic_impl)]
+impl Add for U64x2 {
     type Output = Self;
 
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        U64x2([self.0[0] ^ rhs.0[0], self.0[1] ^ rhs.0[1]])
+    /// Adds two POLYVAL field elements.
+    fn add(self, rhs: Self) -> Self::Output {
+        U64x2(self.0 ^ rhs.0, self.1 ^ rhs.1)
     }
 }
 
-impl Clmul for U64x2 {
-    fn clmul<I>(self, other: Self, imm: I) -> Self
-    where
-        I: Into<clmul::PseudoOp>,
-    {
-        let (a, b) = match imm.into() {
-            clmul::PseudoOp::PCLMULLQLQDQ => (self.0[0], other.0[0]),
-            clmul::PseudoOp::PCLMULHQLQDQ => (self.0[1], other.0[0]),
-            clmul::PseudoOp::PCLMULLQHQDQ => (self.0[0], other.0[1]),
-            clmul::PseudoOp::PCLMULHQHQDQ => (self.0[1], other.0[1]),
-        };
+#[allow(clippy::suspicious_arithmetic_impl)]
+impl Mul for U64x2 {
+    type Output = Self;
 
-        let mut result = [0u64; 2];
+    /// Computes POLYVAL multiplication over GF(2^128).
+    // TODO(tarcieri): actually adapt the arithmetic below from GHASH
+    fn mul(self, rhs: Self) -> Self {
+        let h0 = self.0;
+        let h1 = self.1;
+        let h0r = rev64(h0);
+        let h1r = rev64(h1);
+        let h2 = h0 ^ h1;
+        let h2r = h0r ^ h1r;
+
+        let y0 = rhs.0;
+        let y1 = rhs.1;
+        let y0r = rev64(y0);
+        let y1r = rev64(y1);
+        let y2 = y0 ^ y1;
+        let y2r = y0r ^ y1r;
+        let z0 = bmul64(y0, h0);
+        let z1 = bmul64(y1, h1);
+
+        let mut z2 = bmul64(y2, h2);
+        let mut z0h = bmul64(y0r, h0r);
+        let mut z1h = bmul64(y1r, h1r);
+        let mut z2h = bmul64(y2r, h2r);
+
+        z2 ^= z0 ^ z1;
+        z2h ^= z0h ^ z1h;
+        z0h = rev64(z0h) >> 1;
+        z1h = rev64(z1h) >> 1;
+        z2h = rev64(z2h) >> 1;
+
+        let mut v0 = z0;
+        let mut v1 = z0h ^ z2;
+        let mut v2 = z1 ^ z2h;
+        let mut v3 = z1h;
+
+        v3 = v3 << 1 | v2 >> 63;
+        v2 = v2 << 1 | v1 >> 63;
+        v1 = v1 << 1 | v0 >> 63;
+        v0 <<= 1;
+
+        v2 ^= v0 ^ v0 >> 1 ^ v0 >> 2 ^ v0 >> 7;
+        v1 ^= v0 << 63 ^ v0 << 62 ^ v0 << 57;
+        v3 ^= v1 ^ v1 >> 1 ^ v1 >> 2 ^ v1 >> 7;
+        v2 ^= v1 << 63 ^ v1 << 62 ^ v1 << 57;
+
+        U64x2(v2, v3)
+    }
+}
 
-        for i in 0..64 {
-            if b & (1 << i) != 0 {
-                result[1] ^= a;
-            }
+fn rev64(mut x: u64) -> u64 {
+    x = ((x & 0x5555_5555_5555_5555) << 1) | ((x >> 1) & 0x5555_5555_5555_5555);
+    x = ((x & 0x3333_3333_3333_3333) << 2) | ((x >> 2) & 0x3333_3333_3333_3333);
+    x = ((x & 0x0f0f_0f0f_0f0f_0f0f) << 4) | ((x >> 4) & 0x0f0f_0f0f_0f0f_0f0f);
+    x = ((x & 0x00ff_00ff_00ff_00ff) << 8) | ((x >> 8) & 0x00ff_00ff_00ff_00ff);
+    x = ((x & 0xffff_0000_ffff) << 16) | ((x >> 16) & 0xffff_0000_ffff);
+    (x << 32) | (x >> 32)
+}
 
-            result[0] >>= 1;
+fn bmul64(x: u64, y: u64) -> u64 {
+    let x0 = x & 0x1111_1111_1111_1111;
+    let x1 = x & 0x2222_2222_2222_2222;
+    let x2 = x & 0x4444_4444_4444_4444;
+    let x3 = x & 0x8888_8888_8888_8888;
+    let y0 = y & 0x1111_1111_1111_1111;
+    let y1 = y & 0x2222_2222_2222_2222;
+    let y2 = y & 0x4444_4444_4444_4444;
+    let y3 = y & 0x8888_8888_8888_8888;
 
-            if result[1] & 1 != 0 {
-                result[0] ^= 1 << 63;
-            }
+    let mut z0 =
+        x0.wrapping_mul(y0) ^ x1.wrapping_mul(y3) ^ x2.wrapping_mul(y2) ^ x3.wrapping_mul(y1);
 
-            result[1] >>= 1;
-        }
+    let mut z1 =
+        x0.wrapping_mul(y1) ^ x1.wrapping_mul(y0) ^ x2.wrapping_mul(y3) ^ x3.wrapping_mul(y2);
 
-        U64x2(result)
-    }
-}
+    let mut z2 =
+        x0.wrapping_mul(y2) ^ x1.wrapping_mul(y1) ^ x2.wrapping_mul(y0) ^ x3.wrapping_mul(y3);
 
-impl Backend for U64x2 {
-    fn shuffle(self) -> Self {
-        U64x2([self.0[1], self.0[0]])
-    }
+    let mut z3 =
+        x0.wrapping_mul(y3) ^ x1.wrapping_mul(y2) ^ x2.wrapping_mul(y1) ^ x3.wrapping_mul(y0);
 
-    fn shl64(self) -> Self {
-        U64x2([0, self.0[0]])
-    }
+    z0 &= 0x1111_1111_1111_1111;
+    z1 &= 0x2222_2222_2222_2222;
+    z2 &= 0x4444_4444_4444_4444;
+    z3 &= 0x8888_8888_8888_8888;
 
-    fn shr64(self) -> Self {
-        U64x2([self.0[1], 0])
-    }
+    z0 | z1 | z2 | z3
 }
@@ -14,13 +14,12 @@
 //! ## Requirements
 //!
 //! - Rust 1.34.0 or newer
-//! - `RUSTFLAGS` with `-Ctarget-cpu` and `-Ctarget-feature`:
+//! - Recommended: `RUSTFLAGS` with `-Ctarget-cpu` and `-Ctarget-feature`:
 //!   - x86(-64) CPU: `target-cpu=sandybridge` or newer
 //!   - SSE2 + SSE4.1: `target-feature=+sse2,+sse4.1`
 //!
-//! An **INSECURE** (variable timing) portable implementation is gated behind
-//! the `insecure-soft` cargo feature. Use of this implementation is
-//! **NOT RECOMMENDED** and may potentially leak the POLYVAL key!
+//! If `RUSTFLAGS` are not provided, this crate will fall back to a much slower
+//! software-only implementation.
 //!
 //! ## Relationship to GHASH
 //!