Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 452a2ff

Browse files
committed Sep 3, 2019
polyval: Constant-time software implementation
Adapts BearSSL's `ghash_ctmul64.c` into a constant-time software backend for POLYVAL.
1 parent fc963a5 commit 452a2ff

File tree

9 files changed

+183
-172
lines changed

9 files changed

+183
-172
lines changed
 

‎.travis.yml

+13-17
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ rust:
66
- nightly
77

88
script:
9-
- cargo test --all --exclude polyval --release
9+
- cargo test --all --release
1010
- cargo test --all --all-features --release
1111

1212
env:
@@ -23,36 +23,32 @@ matrix:
2323
rust: 1.34.0
2424
env: {} # clear `-D warnings` above; allow warnings
2525

26-
# polyval presently needs either RUSTFLAGS or non-default features
27-
- name: "Rust: 1.32.0 (polyval)"
26+
# Test `polyval` with the PCLMULQDQ-accelerated backend
27+
- name: "Rust: 1.34.0 (polyval)"
2828
rust: 1.34.0
29-
script: ./test_polyval.sh
29+
env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
30+
script: cd polyval && cargo test --release --tests
3031
- name: "Rust: stable (polyval)"
3132
rust: stable
32-
script: ./test_polyval.sh
33+
env: RUSTFLAGS="-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1"
34+
script: cd polyval && cargo test --release --tests
3335

3436
# no_std build
3537
- name: "Rust: stable (thumbv7em-none-eabihf)"
3638
rust: stable
37-
install:
38-
- rustup target add thumbv7em-none-eabihf
39-
script:
40-
- cargo build --all --exclude polyval --target thumbv7em-none-eabihf --release
39+
install: rustup target add thumbv7em-none-eabihf
40+
script: cargo build --all --target thumbv7em-none-eabihf --release
4141
- name: "Rust: nightly (benches)"
4242
rust: nightly
4343
script: cargo build --all-features --benches
4444
- name: rustfmt
4545
rust: stable
46-
install:
47-
- rustup component add rustfmt
48-
script:
49-
- cargo fmt --all -- --check
46+
install: rustup component add rustfmt
47+
script: cargo fmt --all -- --check
5048
- name: clippy
5149
rust: stable
52-
install:
53-
- rustup component add clippy
54-
script:
55-
- cargo clippy --all
50+
install: rustup component add clippy
51+
script: cargo clippy --all
5652

5753
branches:
5854
only:

‎poly1305/Cargo.toml

-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ universal-hash = { version = "0.2", default-features = false }
1616
zeroize = { version = "0.10", optional = true, default-features = false }
1717

1818
[features]
19-
default = []
2019
std = ["universal-hash/std"]
2120

2221
[badges]

‎polyval/Cargo.toml

-2
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ zeroize = { version = "0.10", optional = true, default-features = false }
2222
hex-literal = "0.1"
2323

2424
[features]
25-
default = []
2625
std = ["universal-hash/std"]
27-
insecure-soft = []
2826

2927
[badges]
3028
maintenance = { status = "experimental" }

‎polyval/src/field.rs

+1-8
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,7 @@ impl<B: Backend> Mul for Element<B> {
7777
///
7878
/// [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3
7979
fn mul(self, rhs: Self) -> Self {
80-
let t1 = self.0.clmul(rhs.0, 0x00);
81-
let t2 = self.0.clmul(rhs.0, 0x01);
82-
let t3 = self.0.clmul(rhs.0, 0x10);
83-
let t4 = self.0.clmul(rhs.0, 0x11);
84-
let t5 = t2 + t3;
85-
let t6 = t4 + t5.shr64();
86-
let t7 = (t1 + t5.shl64()).reduce();
87-
Element(t6 + t7)
80+
Element(self.0 * rhs.0)
8881
}
8982
}
9083

‎polyval/src/field/backend.rs

+12-53
Original file line numberDiff line numberDiff line change
@@ -7,27 +7,12 @@
77
any(target_arch = "x86", target_arch = "x86_64")
88
))]
99
mod pclmulqdq;
10-
11-
#[cfg(feature = "insecure-soft")]
12-
pub mod soft;
10+
mod soft;
1311

1412
use super::Block;
15-
use core::ops::Add;
13+
use core::ops::{Add, Mul};
1614

17-
#[cfg(not(any(
18-
all(
19-
target_feature = "pclmulqdq",
20-
target_feature = "sse2",
21-
target_feature = "sse4.1",
22-
any(target_arch = "x86", target_arch = "x86_64")
23-
),
24-
feature = "insecure-soft"
25-
)))]
26-
compile_error!(
27-
"no backends available! On x86/x86-64 platforms, enable intrinsics with \
28-
RUSTFLAGS=\"-Ctarget-cpu=sandybridge -Ctarget-feature=+sse2,+sse4.1\" or \
29-
enable **INSECURE** portable emulation with the `insecure-soft` feature"
30-
);
15+
// TODO(tarcieri): runtime selection of PCLMULQDQ based on CPU features
3116

3217
#[cfg(all(
3318
target_feature = "pclmulqdq",
@@ -37,42 +22,16 @@ compile_error!(
3722
))]
3823
pub(crate) use self::pclmulqdq::M128i;
3924

40-
#[cfg(all(
41-
not(all(
42-
target_feature = "pclmulqdq",
43-
target_feature = "sse2",
44-
target_feature = "sse4.1",
45-
any(target_arch = "x86", target_arch = "x86_64")
46-
)),
47-
feature = "insecure-soft"
48-
))]
25+
#[cfg(not(all(
26+
target_feature = "pclmulqdq",
27+
target_feature = "sse2",
28+
target_feature = "sse4.1",
29+
any(target_arch = "x86", target_arch = "x86_64")
30+
)))]
4931
pub(crate) use self::soft::U64x2 as M128i;
5032

5133
/// Field arithmetic backend
52-
pub trait Backend: Add<Output = Self> + Copy + From<Block> + Into<Block> + From<u128> {
53-
/// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
54-
/// Algorithm 4: "Montgomery reduction"
55-
///
56-
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
57-
fn reduce(self) -> Self {
58-
// Mask value used when performing Montgomery fast reduction.
59-
// This corresponds to POLYVAL's polynomial with the highest bit unset.
60-
let mask = Self::from(1 << 127 | 1 << 126 | 1 << 121 | 1);
61-
let a = mask.clmul(self, 0x01);
62-
let b = self.shuffle() + a;
63-
let c = mask.clmul(b, 0x01);
64-
b.shuffle() + c
65-
}
66-
67-
/// Carryless multiplication
68-
fn clmul(self, rhs: Self, imm: u8) -> Self;
69-
70-
/// Swap the hi and low 64-bit halves of the register
71-
fn shuffle(self) -> Self;
72-
73-
/// Shift the contents of the register left by 64-bits
74-
fn shl64(self) -> Self;
75-
76-
/// Shift the contents of the register right by 64-bits
77-
fn shr64(self) -> Self;
34+
pub trait Backend:
35+
Copy + Add<Output = Self> + Mul<Output = Self> + From<Block> + Into<Block>
36+
{
7837
}

‎polyval/src/field/backend/pclmulqdq.rs

+41-24
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,33 @@
11
//! Support for the PCLMULQDQ CPU intrinsic on `x86` and `x86_64` target
22
//! architectures.
33
4-
// The code below uses `loadu`/`storeu` to support unaligned loads/stores
5-
#![allow(clippy::cast_ptr_alignment)]
6-
74
#[cfg(target_arch = "x86")]
85
use core::arch::x86::*;
96
#[cfg(target_arch = "x86_64")]
107
use core::arch::x86_64::*;
118

129
use super::Backend;
1310
use crate::field::Block;
14-
use core::ops::Add;
11+
use core::ops::{Add, Mul};
1512

1613
/// Wrapper for `__m128i` - a 128-bit XMM register (SSE2)
1714
#[repr(align(16))]
1815
#[derive(Copy, Clone)]
1916
pub struct M128i(__m128i);
2017

18+
impl Backend for M128i {}
19+
2120
impl From<Block> for M128i {
21+
// `_mm_loadu_si128` performs an unaligned load
22+
#[allow(clippy::cast_ptr_alignment)]
2223
fn from(bytes: Block) -> M128i {
2324
M128i(unsafe { _mm_loadu_si128(bytes.as_ptr() as *const __m128i) })
2425
}
2526
}
2627

2728
impl From<M128i> for Block {
29+
// `_mm_storeu_si128` performs an unaligned store
30+
#[allow(clippy::cast_ptr_alignment)]
2831
fn from(xmm: M128i) -> Block {
2932
let mut result = Block::default();
3033

@@ -36,12 +39,6 @@ impl From<M128i> for Block {
3639
}
3740
}
3841

39-
impl From<u128> for M128i {
40-
fn from(x: u128) -> M128i {
41-
M128i(unsafe { _mm_loadu_si128(&x as *const u128 as *const __m128i) })
42-
}
43-
}
44-
4542
impl Add for M128i {
4643
type Output = Self;
4744

@@ -51,23 +48,41 @@ impl Add for M128i {
5148
}
5249
}
5350

54-
impl Backend for M128i {
55-
/// Wrapper for PCLMULQDQ
56-
fn clmul(self, rhs: Self, imm: u8) -> Self {
57-
M128i(unsafe { pclmulqdq(self.0, rhs.0, imm) })
58-
}
59-
60-
fn shuffle(self) -> Self {
61-
M128i(unsafe { shufpd1(self.0) })
62-
}
51+
impl Mul for M128i {
52+
type Output = Self;
6353

64-
fn shl64(self) -> Self {
65-
M128i(unsafe { pslldq8(self.0) })
54+
/// Computes carryless POLYVAL multiplication over GF(2^128).
55+
fn mul(self, rhs: Self) -> Self {
56+
unsafe {
57+
let t1 = pclmulqdq(self.0, rhs.0, 0x00);
58+
let t2 = pclmulqdq(self.0, rhs.0, 0x01);
59+
let t3 = pclmulqdq(self.0, rhs.0, 0x10);
60+
let t4 = pclmulqdq(self.0, rhs.0, 0x11);
61+
let t5 = xor(t2, t3);
62+
let t6 = xor(t4, psrldq8(t5));
63+
let t7 = xor(t1, pslldq8(t5));
64+
M128i(xor(t6, reduce(t7)))
65+
}
6666
}
67+
}
6768

68-
fn shr64(self) -> Self {
69-
M128i(unsafe { psrldq8(self.0) })
70-
}
69+
/// Mask value used when performing Montgomery fast reduction.
70+
/// This corresponds to POLYVAL's polynomial with the highest bit unset.
71+
const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
72+
73+
/// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
74+
/// Algorithm 4: "Montgomery reduction"
75+
///
76+
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
77+
unsafe fn reduce(x: __m128i) -> __m128i {
78+
// `_mm_loadu_si128` performs an unaligned load
79+
// (`u128` is not necessarily aligned to 16-bytes)
80+
#[allow(clippy::cast_ptr_alignment)]
81+
let mask = _mm_loadu_si128(&MASK as *const u128 as *const __m128i);
82+
let a = pclmulqdq(mask, x, 0x01);
83+
let b = xor(shufpd1(x), a);
84+
let c = pclmulqdq(mask, b, 0x01);
85+
xor(shufpd1(b), c)
7186
}
7287

7388
#[target_feature(enable = "sse2", enable = "sse4.1")]
@@ -94,6 +109,8 @@ unsafe fn psrldq8(a: __m128i) -> __m128i {
94109
// TODO(tarcieri): _mm256_clmulepi64_epi128 (vpclmulqdq)
95110
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
96111
unsafe fn pclmulqdq(a: __m128i, b: __m128i, imm: u8) -> __m128i {
112+
// The `imm` value passed to `_mm_clmulepi64_si128` needs to be a literal
113+
// value since it ends up being encoded into the CPU instruction.
97114
match imm {
98115
// Low-Low: `clmul(a[0..8], b[0..8])` (PCLMULLQLQDQ)
99116
0x00 => _mm_clmulepi64_si128(a, b, 0x00),

‎polyval/src/field/backend/soft.rs

+114-52
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
1-
//! Software emulation support for CLMUL hardware intrinsics.
1+
//! Constant-time software implementation of POLYVAL
22
//!
3-
//! WARNING: Not constant time! Should be made constant-time or disabled by default.
4-
5-
// TODO(tarcieri): performance-oriented constant-time implementation
6-
// See: <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c>
3+
//! Adapted from BearSSL's `ghash_ctmul64.c`
4+
//! <https://bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul64.c;hb=4b6046412bf927d6424f20fc7ee495bb96dbd227>
5+
//!
6+
//! Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
77
88
use super::Backend;
99
use crate::field::Block;
10-
use core::{convert::TryInto, ops::Add};
10+
use core::{
11+
convert::TryInto,
12+
ops::{Add, Mul},
13+
};
1114

12-
/// 2 x `u64` values emulating an XMM register
15+
/// 2 x `u64` values
1316
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
1417
pub struct U64x2(u64, u64);
1518

19+
impl Backend for U64x2 {}
20+
1621
impl From<Block> for U64x2 {
1722
fn from(bytes: Block) -> U64x2 {
1823
U64x2(
@@ -29,67 +34,124 @@ impl From<U64x2> for Block {
2934
}
3035
}
3136

32-
impl From<u128> for U64x2 {
33-
fn from(x: u128) -> U64x2 {
34-
let lo = (x & 0xFFFF_FFFFF) as u64;
35-
let hi = (x >> 64) as u64;
36-
U64x2(lo, hi)
37-
}
38-
}
39-
4037
impl From<U64x2> for u128 {
4138
fn from(u64x2: U64x2) -> u128 {
4239
u128::from(u64x2.0) | (u128::from(u64x2.1) << 64)
4340
}
4441
}
4542

43+
#[allow(clippy::suspicious_arithmetic_impl)]
4644
impl Add for U64x2 {
4745
type Output = Self;
4846

4947
/// Adds two POLYVAL field elements.
50-
fn add(self, rhs: Self) -> Self {
48+
fn add(self, rhs: Self) -> Self::Output {
5149
U64x2(self.0 ^ rhs.0, self.1 ^ rhs.1)
5250
}
5351
}
5452

55-
impl Backend for U64x2 {
56-
fn clmul(self, other: Self, imm: u8) -> Self {
57-
let (a, b) = match imm.into() {
58-
0x00 => (self.0, other.0),
59-
0x01 => (self.1, other.0),
60-
0x10 => (self.0, other.1),
61-
0x11 => (self.1, other.1),
62-
_ => unreachable!(),
63-
};
64-
65-
let mut result = U64x2(0, 0);
66-
67-
for i in 0..64 {
68-
if b & (1 << i) != 0 {
69-
result.1 ^= a;
70-
}
71-
72-
result.0 >>= 1;
73-
74-
if result.1 & 1 != 0 {
75-
result.0 ^= 1 << 63;
76-
}
77-
78-
result.1 >>= 1;
79-
}
80-
81-
result
82-
}
53+
#[allow(clippy::suspicious_arithmetic_impl)]
54+
impl Mul for U64x2 {
55+
type Output = Self;
8356

84-
fn shuffle(self) -> Self {
85-
U64x2(self.1, self.0)
57+
/// Computes carryless POLYVAL multiplication over GF(2^128) in constant time.
58+
///
59+
/// Method described at:
60+
/// <https://www.bearssl.org/constanttime.html#ghash-for-gcm>
61+
///
62+
/// POLYVAL multiplication is effectively the little endian equivalent of
63+
/// GHASH multiplication, aside from one small detail described here:
64+
///
65+
/// <https://crypto.stackexchange.com/questions/66448/how-does-bearssls-gcm-modular-reduction-work/66462#66462>
66+
///
67+
/// > The product of two bit-reversed 128-bit polynomials yields the
68+
/// > bit-reversed result over 255 bits, not 256. The BearSSL code ends up
69+
/// > with a 256-bit result in zw[], and that value is shifted by one bit,
70+
/// > because of that reversed convention issue. Thus, the code must
71+
/// > include a shifting step to put it back where it should
72+
///
73+
/// This shift is unnecessary for POLYVAL and has been removed.
74+
fn mul(self, rhs: Self) -> Self {
75+
let h0 = self.0;
76+
let h1 = self.1;
77+
let h0r = rev64(h0);
78+
let h1r = rev64(h1);
79+
let h2 = h0 ^ h1;
80+
let h2r = h0r ^ h1r;
81+
82+
let y0 = rhs.0;
83+
let y1 = rhs.1;
84+
let y0r = rev64(y0);
85+
let y1r = rev64(y1);
86+
let y2 = y0 ^ y1;
87+
let y2r = y0r ^ y1r;
88+
let z0 = bmul64(y0, h0);
89+
let z1 = bmul64(y1, h1);
90+
91+
let mut z2 = bmul64(y2, h2);
92+
let mut z0h = bmul64(y0r, h0r);
93+
let mut z1h = bmul64(y1r, h1r);
94+
let mut z2h = bmul64(y2r, h2r);
95+
96+
z2 ^= z0 ^ z1;
97+
z2h ^= z0h ^ z1h;
98+
z0h = rev64(z0h) >> 1;
99+
z1h = rev64(z1h) >> 1;
100+
z2h = rev64(z2h) >> 1;
101+
102+
let v0 = z0;
103+
let mut v1 = z0h ^ z2;
104+
let mut v2 = z1 ^ z2h;
105+
let mut v3 = z1h;
106+
107+
v2 ^= v0 ^ v0 >> 1 ^ v0 >> 2 ^ v0 >> 7;
108+
v1 ^= v0 << 63 ^ v0 << 62 ^ v0 << 57;
109+
v3 ^= v1 ^ v1 >> 1 ^ v1 >> 2 ^ v1 >> 7;
110+
v2 ^= v1 << 63 ^ v1 << 62 ^ v1 << 57;
111+
112+
U64x2(v2, v3)
86113
}
114+
}
87115

88-
fn shl64(self) -> Self {
89-
U64x2(0, self.0)
90-
}
116+
/// Reverse a `u64` in constant time
117+
fn rev64(mut x: u64) -> u64 {
118+
x = ((x & 0x5555_5555_5555_5555) << 1) | ((x >> 1) & 0x5555_5555_5555_5555);
119+
x = ((x & 0x3333_3333_3333_3333) << 2) | ((x >> 2) & 0x3333_3333_3333_3333);
120+
x = ((x & 0x0f0f_0f0f_0f0f_0f0f) << 4) | ((x >> 4) & 0x0f0f_0f0f_0f0f_0f0f);
121+
x = ((x & 0x00ff_00ff_00ff_00ff) << 8) | ((x >> 8) & 0x00ff_00ff_00ff_00ff);
122+
x = ((x & 0xffff_0000_ffff) << 16) | ((x >> 16) & 0xffff_0000_ffff);
123+
(x << 32) | (x >> 32)
124+
}
91125

92-
fn shr64(self) -> Self {
93-
U64x2(self.1, 0)
94-
}
126+
/// Carryless integer multiplication with “holes” (sequences of zeroes) to
127+
/// avoid carry spilling. When carries do occur, they wind up in a "hole" and
128+
/// are subsequently masked out of the result.
129+
fn bmul64(x: u64, y: u64) -> u64 {
130+
let x0 = x & 0x1111_1111_1111_1111;
131+
let x1 = x & 0x2222_2222_2222_2222;
132+
let x2 = x & 0x4444_4444_4444_4444;
133+
let x3 = x & 0x8888_8888_8888_8888;
134+
let y0 = y & 0x1111_1111_1111_1111;
135+
let y1 = y & 0x2222_2222_2222_2222;
136+
let y2 = y & 0x4444_4444_4444_4444;
137+
let y3 = y & 0x8888_8888_8888_8888;
138+
139+
let mut z0 =
140+
x0.wrapping_mul(y0) ^ x1.wrapping_mul(y3) ^ x2.wrapping_mul(y2) ^ x3.wrapping_mul(y1);
141+
142+
let mut z1 =
143+
x0.wrapping_mul(y1) ^ x1.wrapping_mul(y0) ^ x2.wrapping_mul(y3) ^ x3.wrapping_mul(y2);
144+
145+
let mut z2 =
146+
x0.wrapping_mul(y2) ^ x1.wrapping_mul(y1) ^ x2.wrapping_mul(y0) ^ x3.wrapping_mul(y3);
147+
148+
let mut z3 =
149+
x0.wrapping_mul(y3) ^ x1.wrapping_mul(y2) ^ x2.wrapping_mul(y1) ^ x3.wrapping_mul(y0);
150+
151+
z0 &= 0x1111_1111_1111_1111;
152+
z1 &= 0x2222_2222_2222_2222;
153+
z2 &= 0x4444_4444_4444_4444;
154+
z3 &= 0x8888_8888_8888_8888;
155+
156+
z0 | z1 | z2 | z3
95157
}

‎polyval/src/lib.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@
1818
//! - x86(-64) CPU: `target-cpu=sandybridge` or newer
1919
//! - SSE2 + SSE4.1: `target-feature=+sse2,+sse4.1`
2020
//!
21-
//! An **INSECURE** (variable timing) portable implementation is gated behind
22-
//! the `insecure-soft` cargo feature. Use of this implementation is
23-
//! **NOT RECOMMENDED** and may potentially leak the POLYVAL key!
21+
//! If `RUSTFLAGS` are not provided, this crate will fall back to a much slower
22+
//! software-only implementation.
2423
//!
2524
//! ## Relationship to GHASH
2625
//!

‎test_polyval.sh

-12
This file was deleted.

0 commit comments

Comments
 (0)
Please sign in to comment.