Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

polyval: Constant-time software implementation #7

Merged
merged 1 commit into from
Sep 3, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 13 additions & 17 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ rust:
- nightly

script:
- cargo test --all --exclude polyval --release
- cargo test --all --release
- cargo test --all --all-features --release

env:
@@ -23,36 +23,32 @@ matrix:
rust: 1.34.0
env: {} # clear `-D warnings` above; allow warnings

# polyval presently needs either RUSTFLAGS or non-default features
- name: "Rust: 1.32.0 (polyval)"
# Test `polyval` with the PCLMULQDQ-accelerated backend
- name: "Rust: 1.34.0 (polyval)"
rust: 1.34.0
script: ./test_polyval.sh
env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
script: cd polyval && cargo test --release --tests
- name: "Rust: stable (polyval)"
rust: stable
script: ./test_polyval.sh
env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
script: cd polyval && cargo test --release --tests

# no_std build
- name: "Rust: stable (thumbv7em-none-eabihf)"
rust: stable
install:
- rustup target add thumbv7em-none-eabihf
script:
- cargo build --all --exclude polyval --target thumbv7em-none-eabihf --release
install: rustup target add thumbv7em-none-eabihf
script: cargo build --all --target thumbv7em-none-eabihf --release
- name: "Rust: nightly (benches)"
rust: nightly
script: cargo build --all-features --benches
- name: rustfmt
rust: stable
install:
- rustup component add rustfmt
script:
- cargo fmt --all -- --check
install: rustup component add rustfmt
script: cargo fmt --all -- --check
- name: clippy
rust: stable
install:
- rustup component add clippy
script:
- cargo clippy --all
install: rustup component add clippy
script: cargo clippy --all

branches:
only:
1 change: 0 additions & 1 deletion poly1305/Cargo.toml
Original file line number Diff line number Diff line change
@@ -16,7 +16,6 @@ universal-hash = { version = "0.2", default-features = false }
zeroize = { version = "0.10", optional = true, default-features = false }

[features]
default = []
std = ["universal-hash/std"]

[badges]
2 changes: 0 additions & 2 deletions polyval/Cargo.toml
Original file line number Diff line number Diff line change
@@ -22,9 +22,7 @@ zeroize = { version = "0.10", optional = true, default-features = false }
hex-literal = "0.1"

[features]
default = []
std = ["universal-hash/std"]
insecure-soft = []

[badges]
maintenance = { status = "experimental" }
9 changes: 1 addition & 8 deletions polyval/src/field.rs
Original file line number Diff line number Diff line change
@@ -77,14 +77,7 @@ impl<B: Backend> Mul for Element<B> {
///
/// [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3
fn mul(self, rhs: Self) -> Self {
let t1 = self.0.clmul(rhs.0, 0x00);
let t2 = self.0.clmul(rhs.0, 0x01);
let t3 = self.0.clmul(rhs.0, 0x10);
let t4 = self.0.clmul(rhs.0, 0x11);
let t5 = t2 + t3;
let t6 = t4 + t5.shr64();
let t7 = (t1 + t5.shl64()).reduce();
Element(t6 + t7)
Element(self.0 * rhs.0)
}
}

65 changes: 12 additions & 53 deletions polyval/src/field/backend.rs
Original file line number Diff line number Diff line change
@@ -7,27 +7,12 @@
any(target_arch = "x86", target_arch = "x86_64")
))]
mod pclmulqdq;

#[cfg(feature = "insecure-soft")]
pub mod soft;
mod soft;

use super::Block;
use core::ops::Add;
use core::ops::{Add, Mul};

#[cfg(not(any(
all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
),
feature = "insecure-soft"
)))]
compile_error!(
"no backends available! On x86/x86-64 platforms, enable intrinsics with \
RUSTFLAGS=\"-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1\" or \
enable **INSECURE** portable emulation with the `insecure-soft` feature"
);
// TODO(tarcieri): runtime selection of PCLMULQDQ based on CPU features

#[cfg(all(
target_feature = "pclmulqdq",
@@ -37,42 +22,16 @@ compile_error!(
))]
pub(crate) use self::pclmulqdq::M128i;

#[cfg(all(
not(all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
)),
feature = "insecure-soft"
))]
#[cfg(not(all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
)))]
pub(crate) use self::soft::U64x2 as M128i;

/// Field arithmetic backend
pub trait Backend: Add<Output = Self> + Copy + From<Block> + Into<Block> + From<u128> {
/// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
/// Algorithm 4: "Montgomery reduction"
///
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
fn reduce(self) -> Self {
// Mask value used when performing Montgomery fast reduction.
// This corresponds to POLYVAL's polynomial with the highest bit unset.
let mask = Self::from(1 << 127 | 1 << 126 | 1 << 121 | 1);
let a = mask.clmul(self, 0x01);
let b = self.shuffle() + a;
let c = mask.clmul(b, 0x01);
b.shuffle() + c
}

/// Carryless multiplication
fn clmul(self, rhs: Self, imm: u8) -> Self;

/// Swap the hi and low 64-bit halves of the register
fn shuffle(self) -> Self;

/// Shift the contents of the register left by 64-bits
fn shl64(self) -> Self;

/// Shift the contents of the register right by 64-bits
fn shr64(self) -> Self;
pub trait Backend:
Copy + Add<Output = Self> + Mul<Output = Self> + From<Block> + Into<Block>
{
}
65 changes: 41 additions & 24 deletions polyval/src/field/backend/pclmulqdq.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
//! Support for the PCLMULQDQ CPU intrinsic on `x86` and `x86_64` target
//! architectures.
// The code below uses `loadu`/`storeu` to support unaligned loads/stores
#![allow(clippy::cast_ptr_alignment)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use super::Backend;
use crate::field::Block;
use core::ops::Add;
use core::ops::{Add, Mul};

/// Wrapper for `__m128i` - a 128-bit XMM register (SSE2)
#[repr(align(16))]
#[derive(Copy, Clone)]
pub struct M128i(__m128i);

impl Backend for M128i {}

impl From<Block> for M128i {
// `_mm_loadu_si128` performs an unaligned load
#[allow(clippy::cast_ptr_alignment)]
fn from(bytes: Block) -> M128i {
M128i(unsafe { _mm_loadu_si128(bytes.as_ptr() as *const __m128i) })
}
}

impl From<M128i> for Block {
// `_mm_storeu_si128` performs an unaligned store
#[allow(clippy::cast_ptr_alignment)]
fn from(xmm: M128i) -> Block {
let mut result = Block::default();

@@ -36,12 +39,6 @@ impl From<M128i> for Block {
}
}

impl From<u128> for M128i {
fn from(x: u128) -> M128i {
M128i(unsafe { _mm_loadu_si128(&x as *const u128 as *const __m128i) })
}
}

impl Add for M128i {
type Output = Self;

@@ -51,23 +48,41 @@ impl Add for M128i {
}
}

impl Backend for M128i {
/// Wrapper for PCLMULQDQ
fn clmul(self, rhs: Self, imm: u8) -> Self {
M128i(unsafe { pclmulqdq(self.0, rhs.0, imm) })
}

fn shuffle(self) -> Self {
M128i(unsafe { shufpd1(self.0) })
}
impl Mul for M128i {
type Output = Self;

fn shl64(self) -> Self {
M128i(unsafe { pslldq8(self.0) })
/// Computes carryless POLYVAL multiplication over GF(2^128).
fn mul(self, rhs: Self) -> Self {
unsafe {
let t1 = pclmulqdq(self.0, rhs.0, 0x00);
let t2 = pclmulqdq(self.0, rhs.0, 0x01);
let t3 = pclmulqdq(self.0, rhs.0, 0x10);
let t4 = pclmulqdq(self.0, rhs.0, 0x11);
let t5 = xor(t2, t3);
let t6 = xor(t4, psrldq8(t5));
let t7 = xor(t1, pslldq8(t5));
M128i(xor(t6, reduce(t7)))
}
}
}

fn shr64(self) -> Self {
M128i(unsafe { psrldq8(self.0) })
}
/// Mask value used when performing Montgomery fast reduction.
/// This corresponds to POLYVAL's polynomial with the highest bit unset.
const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;

/// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
/// Algorithm 4: "Montgomery reduction"
///
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
unsafe fn reduce(x: __m128i) -> __m128i {
// `_mm_loadu_si128` performs an unaligned load
// (`u128` is not necessarily aligned to 16-bytes)
#[allow(clippy::cast_ptr_alignment)]
let mask = _mm_loadu_si128(&MASK as *const u128 as *const __m128i);
let a = pclmulqdq(mask, x, 0x01);
let b = xor(shufpd1(x), a);
let c = pclmulqdq(mask, b, 0x01);
xor(shufpd1(b), c)
}

#[target_feature(enable = "sse2", enable = "sse4.1")]
@@ -94,6 +109,8 @@ unsafe fn psrldq8(a: __m128i) -> __m128i {
// TODO(tarcieri): _mm256_clmulepi64_epi128 (vpclmulqdq)
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn pclmulqdq(a: __m128i, b: __m128i, imm: u8) -> __m128i {
// The `imm` value passed to `_mm_clmulepi64_si128` needs to be a literal
// value since it ends up being encoded into the CPU instruction.
match imm {
// Low-Low: `clmul(a[0..8], b[0..8])` (PCLMULLQLQDQ)
0x00 => _mm_clmulepi64_si128(a, b, 0x00),
166 changes: 114 additions & 52 deletions polyval/src/field/backend/soft.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
//! Software emulation support for CLMUL hardware intrinsics.
//! Constant-time software implementation of POLYVAL
//!
//! WARNING: Not constant time! Should be made constant-time or disabled by default.
// TODO(tarcieri): performance-oriented constant-time implementation
// See: <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c>
//! Adapted from BearSSL's `ghash_ctmul64.c`
//! <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c;hb=4b6046412bf927d6424f20fc7ee495bb96dbd227>
//!
//! Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
use super::Backend;
use crate::field::Block;
use core::{convert::TryInto, ops::Add};
use core::{
convert::TryInto,
ops::{Add, Mul},
};

/// 2 x `u64` values emulating an XMM register
/// 2 x `u64` values
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct U64x2(u64, u64);

impl Backend for U64x2 {}

impl From<Block> for U64x2 {
fn from(bytes: Block) -> U64x2 {
U64x2(
@@ -29,67 +34,124 @@ impl From<U64x2> for Block {
}
}

impl From<u128> for U64x2 {
fn from(x: u128) -> U64x2 {
let lo = (x & 0xFFFF_FFFFF) as u64;
let hi = (x >> 64) as u64;
U64x2(lo, hi)
}
}

impl From<U64x2> for u128 {
fn from(u64x2: U64x2) -> u128 {
u128::from(u64x2.0) | (u128::from(u64x2.1) << 64)
}
}

#[allow(clippy::suspicious_arithmetic_impl)]
impl Add for U64x2 {
type Output = Self;

/// Adds two POLYVAL field elements.
fn add(self, rhs: Self) -> Self {
fn add(self, rhs: Self) -> Self::Output {
U64x2(self.0 ^ rhs.0, self.1 ^ rhs.1)
}
}

impl Backend for U64x2 {
fn clmul(self, other: Self, imm: u8) -> Self {
let (a, b) = match imm.into() {
0x00 => (self.0, other.0),
0x01 => (self.1, other.0),
0x10 => (self.0, other.1),
0x11 => (self.1, other.1),
_ => unreachable!(),
};

let mut result = U64x2(0, 0);

for i in 0..64 {
if b & (1 << i) != 0 {
result.1 ^= a;
}

result.0 >>= 1;

if result.1 & 1 != 0 {
result.0 ^= 1 << 63;
}

result.1 >>= 1;
}

result
}
#[allow(clippy::suspicious_arithmetic_impl)]
impl Mul for U64x2 {
type Output = Self;

fn shuffle(self) -> Self {
U64x2(self.1, self.0)
/// Computes carryless POLYVAL multiplication over GF(2^128) in constant time.
///
/// Method described at:
/// <https://www.bearssl.org/constanttime.html#ghash-for-gcm>
///
/// POLYVAL multiplication is effectively the little endian equivalent of
/// GHASH multiplication, aside from one small detail described here:
///
/// <https://crypto.stackexchange.com/questions/66448/how-does-bearssls-gcm-modular-reduction-work/66462#66462>
///
/// > The product of two bit-reversed 128-bit polynomials yields the
/// > bit-reversed result over 255 bits, not 256. The BearSSL code ends up
/// > with a 256-bit result in zw[], and that value is shifted by one bit,
/// > because of that reversed convention issue. Thus, the code must
/// > include a shifting step to put it back where it should
///
/// This shift is unnecessary for POLYVAL and has been removed.
fn mul(self, rhs: Self) -> Self {
let h0 = self.0;
let h1 = self.1;
let h0r = rev64(h0);
let h1r = rev64(h1);
let h2 = h0 ^ h1;
let h2r = h0r ^ h1r;

let y0 = rhs.0;
let y1 = rhs.1;
let y0r = rev64(y0);
let y1r = rev64(y1);
let y2 = y0 ^ y1;
let y2r = y0r ^ y1r;
let z0 = bmul64(y0, h0);
let z1 = bmul64(y1, h1);

let mut z2 = bmul64(y2, h2);
let mut z0h = bmul64(y0r, h0r);
let mut z1h = bmul64(y1r, h1r);
let mut z2h = bmul64(y2r, h2r);

z2 ^= z0 ^ z1;
z2h ^= z0h ^ z1h;
z0h = rev64(z0h) >> 1;
z1h = rev64(z1h) >> 1;
z2h = rev64(z2h) >> 1;

let v0 = z0;
let mut v1 = z0h ^ z2;
let mut v2 = z1 ^ z2h;
let mut v3 = z1h;

v2 ^= v0 ^ v0 >> 1 ^ v0 >> 2 ^ v0 >> 7;
v1 ^= v0 << 63 ^ v0 << 62 ^ v0 << 57;
v3 ^= v1 ^ v1 >> 1 ^ v1 >> 2 ^ v1 >> 7;
v2 ^= v1 << 63 ^ v1 << 62 ^ v1 << 57;

U64x2(v2, v3)
}
}

fn shl64(self) -> Self {
U64x2(0, self.0)
}
/// Reverse a `u64` in constant time
fn rev64(mut x: u64) -> u64 {
x = ((x & 0x5555_5555_5555_5555) << 1) | ((x >> 1) & 0x5555_5555_5555_5555);
x = ((x & 0x3333_3333_3333_3333) << 2) | ((x >> 2) & 0x3333_3333_3333_3333);
x = ((x & 0x0f0f_0f0f_0f0f_0f0f) << 4) | ((x >> 4) & 0x0f0f_0f0f_0f0f_0f0f);
x = ((x & 0x00ff_00ff_00ff_00ff) << 8) | ((x >> 8) & 0x00ff_00ff_00ff_00ff);
x = ((x & 0xffff_0000_ffff) << 16) | ((x >> 16) & 0xffff_0000_ffff);
(x << 32) | (x >> 32)
}

fn shr64(self) -> Self {
U64x2(self.1, 0)
}
/// Carryless integer multiplication with “holes” (sequences of zeroes) to
/// avoid carry spilling. When carries do occur, they wind up in a "hole" and
/// are subsequently masked out of the result.
fn bmul64(x: u64, y: u64) -> u64 {
let x0 = x & 0x1111_1111_1111_1111;
let x1 = x & 0x2222_2222_2222_2222;
let x2 = x & 0x4444_4444_4444_4444;
let x3 = x & 0x8888_8888_8888_8888;
let y0 = y & 0x1111_1111_1111_1111;
let y1 = y & 0x2222_2222_2222_2222;
let y2 = y & 0x4444_4444_4444_4444;
let y3 = y & 0x8888_8888_8888_8888;

let mut z0 =
x0.wrapping_mul(y0) ^ x1.wrapping_mul(y3) ^ x2.wrapping_mul(y2) ^ x3.wrapping_mul(y1);

let mut z1 =
x0.wrapping_mul(y1) ^ x1.wrapping_mul(y0) ^ x2.wrapping_mul(y3) ^ x3.wrapping_mul(y2);

let mut z2 =
x0.wrapping_mul(y2) ^ x1.wrapping_mul(y1) ^ x2.wrapping_mul(y0) ^ x3.wrapping_mul(y3);

let mut z3 =
x0.wrapping_mul(y3) ^ x1.wrapping_mul(y2) ^ x2.wrapping_mul(y1) ^ x3.wrapping_mul(y0);

z0 &= 0x1111_1111_1111_1111;
z1 &= 0x2222_2222_2222_2222;
z2 &= 0x4444_4444_4444_4444;
z3 &= 0x8888_8888_8888_8888;

z0 | z1 | z2 | z3
}
5 changes: 2 additions & 3 deletions polyval/src/lib.rs
Original file line number Diff line number Diff line change
@@ -18,9 +18,8 @@
//! - x86(-64) CPU: `target-cpu=sandybridge` or newer
//! - SSE2 + SSE4.1: `target-feature=+sse2,+sse4.1`
//!
//! An **INSECURE** (variable timing) portable implementation is gated behind
//! the `insecure-soft` cargo feature. Use of this implementation is
//! **NOT RECOMMENDED** and may potentially leak the POLYVAL key!
//! If `RUSTFLAGS` are not provided, this crate will fall back to a much slower
//! software-only implementation.
//!
//! ## Relationship to GHASH
//!
12 changes: 0 additions & 12 deletions test_polyval.sh

This file was deleted.