Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Poly1305 AVX2 backend #49

Merged
merged 8 commits into from
Sep 9, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions poly1305/fuzz/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#[macro_use]
extern crate afl;

fn main() {
fuzz!(|data: &[u8]| {
// Use first 32 bytes of data as key.
if data.len() >= 32 {
poly1305::fuzz_avx2((&data[0..32]).into(), &data[32..]);
}
});
}
157 changes: 157 additions & 0 deletions poly1305/src/avx2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
//! AVX2 implementation of the Poly1305 state machine.
// The State struct and its logic was originally derived from Goll and Gueron's AVX2 C
// code:
// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463)
//
// which was sourced from Bhattacharyya and Sarkar's modified variant:
// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842)
// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305
//
// The logic has been extensively rewritten and documented, and several bugs in the
// original C code were fixed.
//
// Note that State only implements the original Goll-Gueron algorithm, not the
// optimisations provided by Bhattacharyya and Sarkar. The latter require the message
// length to be known, which is incompatible with the streaming API of UniversalHash.

use universal_hash::generic_array::GenericArray;

use crate::{Block, Key, Tag, BLOCK_SIZE};

mod helpers;
use self::helpers::*;

const BLOCK_X4_SIZE: usize = BLOCK_SIZE * 4;

#[derive(Clone)]
struct Initialized {
p: Aligned4x130,
m: SpacedMultiplier4x130,
r4: PrecomputedMultiplier,
}

#[derive(Clone)]
pub(crate) struct State {
k: AdditionKey,
r1: PrecomputedMultiplier,
r2: PrecomputedMultiplier,
initialized: Option<Initialized>,
cached_blocks: [u8; BLOCK_X4_SIZE],
num_cached_blocks: usize,
partial_block: Option<Block>,
}

impl State {
/// Initialize Poly1305 state with the given key
pub(crate) fn new(key: &Key) -> Self {
// Prepare addition key and polynomial key.
let (k, r1) = prepare_keys(key);

// Precompute R^2.
let r2 = (r1 * r1).reduce();

State {
k,
r1,
r2: r2.into(),
initialized: None,
cached_blocks: [0u8; BLOCK_X4_SIZE],
num_cached_blocks: 0,
partial_block: None,
}
}

/// Reset internal state
pub(crate) fn reset(&mut self) {
self.initialized = None;
self.num_cached_blocks = 0;
}

pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) {
// We can cache a single partial block.
if partial {
assert!(self.partial_block.is_none());
self.partial_block = Some(*block);
return;
}

self.cached_blocks
[self.num_cached_blocks * BLOCK_SIZE..(self.num_cached_blocks + 1) * BLOCK_SIZE]
.copy_from_slice(block);
if self.num_cached_blocks < 3 {
self.num_cached_blocks += 1;
return;
} else {
self.num_cached_blocks = 0;
}

if let Some(inner) = &mut self.initialized {
// P <-- R^4 * P + blocks
inner.p =
(&inner.p * inner.r4).reduce() + Aligned4x130::from_blocks(&self.cached_blocks[..]);
} else {
// Initialize the polynomial.
let p = Aligned4x130::from_blocks(&self.cached_blocks[..]);

// Initialize the multiplier (used to merge down the polynomial during
// finalization).
let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2);

self.initialized = Some(Initialized { p, m, r4 })
}
}

pub(crate) fn finalize(&mut self) -> Tag {
assert!(self.num_cached_blocks < 4);
let mut data = &self.cached_blocks[..];

// T ← R◦T
// P = T_0 + T_1 + T_2 + T_3
let mut p = self
.initialized
.take()
.map(|inner| (inner.p * inner.m).sum().reduce());

if self.num_cached_blocks >= 2 {
// Compute 32 byte block (remaining data < 64 bytes)
let mut c = Aligned2x130::from_blocks(&data[0..BLOCK_SIZE * 2]);
if let Some(p) = p {
c = c + p;
}
p = Some(c.mul_and_sum(self.r1, self.r2).reduce());
data = &data[BLOCK_SIZE * 2..];
self.num_cached_blocks -= 2;
}

if self.num_cached_blocks == 1 {
// Compute 16 byte block (remaining data < 32 bytes)
let mut c = Aligned130::from_block(&data[0..BLOCK_SIZE]);
if let Some(p) = p {
c = c + p;
}
p = Some((c * self.r1).reduce());
self.num_cached_blocks -= 1;
}

if let Some(block) = &self.partial_block {
// Compute last block (remaining data < 16 bytes)
let mut c = Aligned130::from_partial_block(block);
if let Some(p) = p {
c = c + p;
}
p = Some((c * self.r1).reduce());
}

// Compute tag: p + k mod 2^128
let mut tag = GenericArray::<u8, _>::default();
let tag_int = if let Some(p) = p {
self.k + p
} else {
self.k.into()
};
tag_int.write(tag.as_mut_slice());

Tag::new(tag)
}
}
1,984 changes: 1,984 additions & 0 deletions poly1305/src/avx2/helpers.rs

Large diffs are not rendered by default.

153 changes: 153 additions & 0 deletions poly1305/src/fuzz.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
use super::fuzz_avx2;

fn avx2_fuzzer_test_case(data: &[u8]) {
fuzz_avx2(data[0..32].into(), &data[32..]);
}

#[test]
fn crash_0() {
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000000,sig:06,src:000014,op:flip4,pos:11"
));
}

#[test]
fn crash_1() {
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000001,sig:06,src:000006+000014,op:splice,rep:64"
));
}

#[test]
fn crash_2() {
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000002,sig:06,src:000008+000014,op:splice,rep:32"
));
}

#[test]
fn crash_3() {
// This input corresponds to a key of:
// r = 0x0f245bfc0f7fe5fc0fffff3400fb1c2b
// s = 0xffffff000001000040f6fff5ffffffff
//
// and input blocks:
// [0x01ea0010000a00ff108b72ffffffffffff, 0x01ffffffff245b74ff7fe5ffffff0040ff,
// 0x01000a00ff108b7200ff04000002ffffff, 0x01ffffffffffffffffffff0000ffea0010,
// 0x0180ffffffffffffffffffffffe3ffffff, 0x01ffffffffffffffffffffffffffffffff,
// 0x01ffffffffffffffffffdfffff03ffffff, 0x01ffffffffff245b74ff7fe5ffffe4ffff,
// 0x0112118b7d00ffeaffffffffffffffffff, 0x010e40eb10ffffffff1edd7f0010000a00]
//
// When this crash occurred, the software and AVX2 backends would generate the same
// tags given the first seven blocks as input. Given the first eight blocks, the
// following tags were generated:
//
// | tag | low 128 bits of final accumulator
// soft | 0x0004d01b9168ded528a9b541cc461988 - s = 0x0004d11b9167ded4e7b2b54bcc461989
// avx2 | 0x0004d01b9168ded528a9b540cc461988 - s = 0x0004d11b9167ded4e7b2b54acc461989
// difference = 0x0100000000
//
// This discrepancy was due to Unreduced130::reduce (as called during finalization)
// not correctly reducing. During the reduction step, the upper limb's upper bits
// (beyond 2^130) are added into the lower limb multiplied by 5 (for reduction modulo
// 2^130 - 5). This is computed like so:
//
// b = t_4 >> 26
// t_0 += b + (b << 2)
//
// It is possible for the upper limb to be 57+ bits; thus b << 2 can be 33+ bits.
// However, the original reduction code was using _mm256_slli_epi32, which shifts
// packed 32-bit integers; this was causing the upper bits of b to be lost. Switching
// to _mm256_slli_epi64 (correctly treating b as a 64-bit field) solves the problem.
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000003,sig:06,src:000003,op:havoc,rep:64"
));
}

#[test]
fn crash_4() {
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000004,sig:06,src:000022+000005,op:splice,rep:32"
));
}

#[test]
fn crash_5() {
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000005,sig:06,src:000008+000007,op:splice,rep:128"
));
}

#[test]
fn crash_6() {
// This input corresponds to a key of:
// r = 0x04040404040404040404040404040404
// s = 0x0404040403ef04040404040404040404
//
// and input:
// [0x04, 0x04, 0x04, 0xf2]
//
// The input fits into a single short block:
// m = 0x01f2040404
//
// and we should have the following computation:
// tag = ((m * r) % p) + s
// = ((0x01f2040404 * 0x04040404040404040404040404040404) % p) + s
// = (0x7cfdfeffffffffffffffffffffffffff8302010 % ((1 << 130) - 5)) + s
// = 0x1f3f7fc + 0x0404040403ef04040404040404040404
// = 0x0404040403ef04040404040405f7fc00
//
// or in bytes:
// tag = [
// 0x00, 0xfc, 0xf7, 0x05, 0x04, 0x04, 0x04, 0x04,
// 0x04, 0x04, 0xef, 0x03, 0x04, 0x04, 0x04, 0x04,
// ];
//
// The crash was caused by the final modular reduction (in the `addkey` method of the
// Goll-Gueron implementation, and `impl Add<Aligned130> for AdditionKey` here) not
// fully carrying all bits. `Aligned130` is guaranteed to be a 130-bit integer, but is
// not guaranteed to be an integer modulo 2^130 - 5.
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000006,sig:06,src:000005,op:havoc,rep:8"
));
}

#[test]
fn crash_7() {
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000007,sig:06,src:000024+000000,op:splice,rep:64"
));
}

#[test]
fn crash_8() {
// This input corresponds to a key of:
// r = 0x0fff00fc0000000000000000006f91ab
// s = 0xffffffffffffffffffffffffffffffff
//
// and a single input block:
// 0x01d4d4ffffffffffffffffffffffffffff
//
// We should have the following computation:
// tag = ((m * r) % p) + s
// = ((0x01d4d4ffffffffffffffffffffffffffff * 0x0fff00fc0000000000000000006f91ab) % p) + s
// = (0x1d4b7cf881ac00000000000000cc5320bf47ff03ffffffffffffffffff906e55 % ((1 << 130) - 5)) + s
// = 0xe3e65b3aa217000000000000008fd63d + 0xffffffffffffffffffffffffffffffff
// = 0x01e3e65b3aa217000000000000008fd63c mod 128
//
// or in bytes:
// tag = [
// 0x3c, 0xd6, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00,
// 0x00, 0x00, 0x17, 0xa2, 0x3a, 0x5b, 0xe6, 0xe3,
// ];
//
// The crash was caused by the final modular reduction (in the `addkey` method of the
// Goll-Gueron implementation, and `impl Add<Aligned130> for AdditionKey` here). After
// adding s, limbs 0 and 2 have carries, while limb 1 is 0xffffffff. The original
// implementation only carried once, after which limb 1 has a carry, which was then
// discarded. The fix was to always carry three times, to ensure that all potential
// carry bits are carried.
avx2_fuzzer_test_case(include_bytes!(
"fuzz/id:000008,sig:06,src:000019,time:165655+000011,op:splice,rep:128"
));
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
��
Binary file not shown.
Binary file not shown.
73 changes: 71 additions & 2 deletions poly1305/src/lib.rs
Original file line number Diff line number Diff line change
@@ -10,6 +10,9 @@
#![doc(html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")]
#![warn(missing_docs, rust_2018_idioms)]

#[cfg(feature = "std")]
extern crate std;

pub use universal_hash;

use universal_hash::{
@@ -18,7 +21,37 @@ use universal_hash::{
NewUniversalHash, UniversalHash,
};

#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2"
))]
mod avx2;
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2"
))]
use avx2::State;

#[cfg(any(
not(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2"
)),
any(fuzzing, test)
))]
mod soft;
#[cfg(not(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2"
)))]
use soft::State;

#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2",
any(fuzzing, test)
))]
mod fuzz;

/// Size of a Poly1305 key
pub const KEY_SIZE: usize = 32;
@@ -43,7 +76,7 @@ pub type Tag = universal_hash::Output<Poly1305>;
/// For this reason it doesn't impl the `crypto_mac::Mac` trait.
#[derive(Clone)]
pub struct Poly1305 {
state: soft::State,
state: State,
}

impl NewUniversalHash for Poly1305 {
@@ -52,7 +85,7 @@ impl NewUniversalHash for Poly1305 {
/// Initialize Poly1305 with the given key
fn new(key: &Key) -> Poly1305 {
Poly1305 {
state: soft::State::new(key),
state: State::new(key),
}
}
}
@@ -96,3 +129,39 @@ impl Poly1305 {
self.state.finalize()
}
}

/// Helper function for fuzzing the AVX2 backend.
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2",
any(fuzzing, test)
))]
pub fn fuzz_avx2(key: &Key, data: &[u8]) {
let mut avx2 = avx2::State::new(key);
let mut soft = soft::State::new(key);

for (_i, chunk) in data.chunks(BLOCK_SIZE).enumerate() {
if chunk.len() == BLOCK_SIZE {
let block = GenericArray::from_slice(chunk);
avx2.compute_block(block, false);
soft.compute_block(block, false);
} else {
let mut block = Block::default();
block[..chunk.len()].copy_from_slice(chunk);
block[chunk.len()] = 1;
avx2.compute_block(&block, true);
soft.compute_block(&block, true);
}

// Check that the same tag would be derived after each chunk.
// We add the chunk number to the assertion for debugging.
// When fuzzing, we skip this check, and just look at the end.
#[cfg(test)]
assert_eq!(
(_i + 1, avx2.clone().finalize().into_bytes()),
(_i + 1, soft.clone().finalize().into_bytes()),
);
}

assert_eq!(avx2.finalize().into_bytes(), soft.finalize().into_bytes());
}