Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 22b9140

Browse files
committed Sep 5, 2020
poly1305: AVX2 backend for x86 and x86_64
Originally derived from Goll and Gueron's AVX2 C code. The logic has been extensively rewritten and documented, and several bugs in the original C code were fixed.
1 parent 5cec96e commit 22b9140

File tree

3 files changed

+1971
-2
lines changed

3 files changed

+1971
-2
lines changed
 

‎poly1305/src/avx2.rs

+157
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
//! AVX2 implementation of the Poly1305 state machine.
2+
3+
// The State struct and its logic were originally derived from Goll and Gueron's AVX2 C
4+
// code:
5+
// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463)
6+
//
7+
// which was sourced from Bhattacharyya and Sarkar's modified variant:
8+
// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842)
9+
// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305
10+
//
11+
// The logic has been extensively rewritten and documented, and several bugs in the
12+
// original C code were fixed.
13+
//
14+
// Note that State only implements the original Goll-Gueron algorithm, not the
15+
// optimisations provided by Bhattacharyya and Sarkar. The latter require the message
16+
// length to be known, which is incompatible with the streaming API of UniversalHash.
17+
18+
use universal_hash::generic_array::GenericArray;
19+
20+
use crate::{Block, Key, Tag, BLOCK_SIZE};
21+
22+
mod helpers;
23+
use self::helpers::*;
24+
25+
const BLOCK_X4_SIZE: usize = BLOCK_SIZE * 4;
26+
27+
/// Accumulator state that `State::compute_block` creates the first time four full
/// blocks have been buffered.
#[derive(Clone)]
28+
struct Initialized {
29+
/// Running polynomial; each further group of four blocks updates it as
/// `P <-- R^4 * P + blocks`.
p: Aligned4x130,
30+
/// Multiplier used to merge down the polynomial during finalization.
m: SpacedMultiplier4x130,
31+
/// Multiplier applied to `p` before each new group of four blocks is added (R^4).
r4: PrecomputedMultiplier,
32+
}
33+
34+
/// Poly1305 state machine for the AVX2 backend.
#[derive(Clone)]
35+
pub(crate) struct State {
36+
/// Addition key; added to the final polynomial value to produce the tag.
k: AdditionKey,
37+
/// Polynomial key R, as returned by `prepare_keys`.
r1: PrecomputedMultiplier,
38+
/// Precomputed R^2.
r2: PrecomputedMultiplier,
39+
/// `None` until the first group of four full blocks has been processed.
initialized: Option<Initialized>,
40+
/// Buffer holding up to four full blocks that have not been processed yet.
cached_blocks: [u8; BLOCK_X4_SIZE],
41+
/// Number of full blocks currently held in `cached_blocks`.
num_cached_blocks: usize,
42+
/// At most one cached partial block; it is only processed in `finalize`.
partial_block: Option<Block>,
43+
}
44+
45+
impl State {
46+
/// Initialize Poly1305 state with the given key
47+
pub(crate) fn new(key: &Key) -> Self {
48+
// Prepare addition key and polynomial key.
49+
let (k, r1) = prepare_keys(key);
50+
51+
// Precompute R^2.
52+
let r2 = (r1 * r1).reduce();
53+
54+
State {
55+
k,
56+
r1,
57+
r2: r2.into(),
58+
initialized: None,
59+
cached_blocks: [0u8; BLOCK_X4_SIZE],
60+
num_cached_blocks: 0,
61+
partial_block: None,
62+
}
63+
}
64+
65+
/// Reset internal state
66+
pub(crate) fn reset(&mut self) {
67+
self.initialized = None;
68+
self.num_cached_blocks = 0;
69+
}
70+
71+
pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) {
72+
// We can cache a single partial block.
73+
if partial {
74+
assert!(self.partial_block.is_none());
75+
self.partial_block = Some(*block);
76+
return;
77+
}
78+
79+
self.cached_blocks
80+
[self.num_cached_blocks * BLOCK_SIZE..(self.num_cached_blocks + 1) * BLOCK_SIZE]
81+
.copy_from_slice(block);
82+
if self.num_cached_blocks < 3 {
83+
self.num_cached_blocks += 1;
84+
return;
85+
} else {
86+
self.num_cached_blocks = 0;
87+
}
88+
89+
if let Some(inner) = &mut self.initialized {
90+
// P <-- R^4 * P + blocks
91+
inner.p =
92+
(&inner.p * inner.r4).reduce() + Aligned4x130::from_blocks(&self.cached_blocks[..]);
93+
} else {
94+
// Initialize the polynomial.
95+
let p = Aligned4x130::from_blocks(&self.cached_blocks[..]);
96+
97+
// Initialize the multiplier (used to merge down the polynomial during
98+
// finalization).
99+
let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2);
100+
101+
self.initialized = Some(Initialized { p, m, r4 })
102+
}
103+
}
104+
105+
pub(crate) fn finalize(&mut self) -> Tag {
106+
assert!(self.num_cached_blocks < 4);
107+
let mut data = &self.cached_blocks[..];
108+
109+
// T ← R◦T
110+
// P = T_0 + T_1 + T_2 + T_3
111+
let mut p = self
112+
.initialized
113+
.take()
114+
.map(|inner| (inner.p * inner.m).sum().reduce());
115+
116+
if self.num_cached_blocks >= 2 {
117+
// Compute 32 byte block (remaining data < 64 bytes)
118+
let mut c = Aligned2x130::from_blocks(&data[0..BLOCK_SIZE * 2]);
119+
if let Some(p) = p {
120+
c = c + p;
121+
}
122+
p = Some(c.mul_and_sum(self.r1, self.r2).reduce());
123+
data = &data[BLOCK_SIZE * 2..];
124+
self.num_cached_blocks -= 2;
125+
}
126+
127+
if self.num_cached_blocks == 1 {
128+
// Compute 16 byte block (remaining data < 32 bytes)
129+
let mut c = Aligned130::from_block(&data[0..BLOCK_SIZE]);
130+
if let Some(p) = p {
131+
c = c + p;
132+
}
133+
p = Some((c * self.r1).reduce());
134+
self.num_cached_blocks -= 1;
135+
}
136+
137+
if let Some(block) = &self.partial_block {
138+
// Compute last block (remaining data < 16 bytes)
139+
let mut c = Aligned130::from_partial_block(block);
140+
if let Some(p) = p {
141+
c = c + p;
142+
}
143+
p = Some((c * self.r1).reduce());
144+
}
145+
146+
// Compute tag: p + k mod 2^128
147+
let mut tag = GenericArray::<u8, _>::default();
148+
let tag_int = if let Some(p) = p {
149+
self.k + p
150+
} else {
151+
self.k.into()
152+
};
153+
tag_int.write(tag.as_mut_slice());
154+
155+
Tag::new(tag)
156+
}
157+
}

‎poly1305/src/avx2/helpers.rs

+1,789
Large diffs are not rendered by default.

‎poly1305/src/lib.rs

+25-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
#![doc(html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")]
1111
#![warn(missing_docs, rust_2018_idioms)]
1212

13+
#[cfg(feature = "std")]
14+
extern crate std;
15+
1316
pub use universal_hash;
1417

1518
use universal_hash::{
@@ -18,7 +21,27 @@ use universal_hash::{
1821
NewUniversalHash, UniversalHash,
1922
};
2023

24+
#[cfg(all(
25+
any(target_arch = "x86", target_arch = "x86_64"),
26+
target_feature = "avx2"
27+
))]
28+
mod avx2;
29+
#[cfg(all(
30+
any(target_arch = "x86", target_arch = "x86_64"),
31+
target_feature = "avx2"
32+
))]
33+
use avx2::State;
34+
35+
#[cfg(not(all(
36+
any(target_arch = "x86", target_arch = "x86_64"),
37+
target_feature = "avx2"
38+
)))]
2139
mod soft;
40+
#[cfg(not(all(
41+
any(target_arch = "x86", target_arch = "x86_64"),
42+
target_feature = "avx2"
43+
)))]
44+
use soft::State;
2245

2346
/// Size of a Poly1305 key
2447
pub const KEY_SIZE: usize = 32;
@@ -43,7 +66,7 @@ pub type Tag = universal_hash::Output<Poly1305>;
4366
/// For this reason it doesn't impl the `crypto_mac::Mac` trait.
4467
#[derive(Clone)]
4568
pub struct Poly1305 {
46-
state: soft::State,
69+
state: State,
4770
}
4871

4972
impl NewUniversalHash for Poly1305 {
@@ -52,7 +75,7 @@ impl NewUniversalHash for Poly1305 {
5275
/// Initialize Poly1305 with the given key
5376
fn new(key: &Key) -> Poly1305 {
5477
Poly1305 {
55-
state: soft::State::new(key),
78+
state: State::new(key),
5679
}
5780
}
5881
}

0 commit comments

Comments
 (0)
Please sign in to comment.