|
| 1 | +//! AVX2 implementation of the Poly1305 state machine. |
| 2 | +
|
| 3 | +// The State struct and its logic were originally derived from Goll and Gueron's AVX2 C |
| 4 | +// code: |
| 5 | +// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463) |
| 6 | +// |
| 7 | +// which was sourced from Bhattacharyya and Sarkar's modified variant: |
| 8 | +// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842) |
| 9 | +// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305 |
| 10 | +// |
| 11 | +// The logic has been extensively rewritten and documented, and several bugs in the |
| 12 | +// original C code were fixed. |
| 13 | +// |
| 14 | +// Note that State only implements the original Goll-Gueron algorithm, not the |
| 15 | +// optimisations provided by Bhattacharyya and Sarkar. The latter require the message |
| 16 | +// length to be known, which is incompatible with the streaming API of UniversalHash. |
| 17 | + |
| 18 | +use universal_hash::generic_array::GenericArray; |
| 19 | + |
| 20 | +use crate::{Block, Key, Tag, BLOCK_SIZE}; |
| 21 | + |
| 22 | +mod helpers; |
| 23 | +use self::helpers::*; |
| 24 | + |
| 25 | +const BLOCK_X4_SIZE: usize = BLOCK_SIZE * 4; |
| 26 | + |
| 27 | +#[derive(Clone)] |
| 28 | +struct Initialized { |
| 29 | + p: Aligned4x130, |
| 30 | + m: SpacedMultiplier4x130, |
| 31 | + r4: PrecomputedMultiplier, |
| 32 | +} |
| 33 | + |
| 34 | +#[derive(Clone)] |
| 35 | +pub(crate) struct State { |
| 36 | + k: AdditionKey, |
| 37 | + r1: PrecomputedMultiplier, |
| 38 | + r2: PrecomputedMultiplier, |
| 39 | + initialized: Option<Initialized>, |
| 40 | + cached_blocks: [u8; BLOCK_X4_SIZE], |
| 41 | + num_cached_blocks: usize, |
| 42 | + partial_block: Option<Block>, |
| 43 | +} |
| 44 | + |
| 45 | +impl State { |
| 46 | + /// Initialize Poly1305 state with the given key |
| 47 | + pub(crate) fn new(key: &Key) -> Self { |
| 48 | + // Prepare addition key and polynomial key. |
| 49 | + let (k, r1) = prepare_keys(key); |
| 50 | + |
| 51 | + // Precompute R^2. |
| 52 | + let r2 = (r1 * r1).reduce(); |
| 53 | + |
| 54 | + State { |
| 55 | + k, |
| 56 | + r1, |
| 57 | + r2: r2.into(), |
| 58 | + initialized: None, |
| 59 | + cached_blocks: [0u8; BLOCK_X4_SIZE], |
| 60 | + num_cached_blocks: 0, |
| 61 | + partial_block: None, |
| 62 | + } |
| 63 | + } |
| 64 | + |
| 65 | + /// Reset internal state |
| 66 | + pub(crate) fn reset(&mut self) { |
| 67 | + self.initialized = None; |
| 68 | + self.num_cached_blocks = 0; |
| 69 | + } |
| 70 | + |
| 71 | + pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) { |
| 72 | + // We can cache a single partial block. |
| 73 | + if partial { |
| 74 | + assert!(self.partial_block.is_none()); |
| 75 | + self.partial_block = Some(*block); |
| 76 | + return; |
| 77 | + } |
| 78 | + |
| 79 | + self.cached_blocks |
| 80 | + [self.num_cached_blocks * BLOCK_SIZE..(self.num_cached_blocks + 1) * BLOCK_SIZE] |
| 81 | + .copy_from_slice(block); |
| 82 | + if self.num_cached_blocks < 3 { |
| 83 | + self.num_cached_blocks += 1; |
| 84 | + return; |
| 85 | + } else { |
| 86 | + self.num_cached_blocks = 0; |
| 87 | + } |
| 88 | + |
| 89 | + if let Some(inner) = &mut self.initialized { |
| 90 | + // P <-- R^4 * P + blocks |
| 91 | + inner.p = |
| 92 | + (&inner.p * inner.r4).reduce() + Aligned4x130::from_blocks(&self.cached_blocks[..]); |
| 93 | + } else { |
| 94 | + // Initialize the polynomial. |
| 95 | + let p = Aligned4x130::from_blocks(&self.cached_blocks[..]); |
| 96 | + |
| 97 | + // Initialize the multiplier (used to merge down the polynomial during |
| 98 | + // finalization). |
| 99 | + let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2); |
| 100 | + |
| 101 | + self.initialized = Some(Initialized { p, m, r4 }) |
| 102 | + } |
| 103 | + } |
| 104 | + |
| 105 | + pub(crate) fn finalize(&mut self) -> Tag { |
| 106 | + assert!(self.num_cached_blocks < 4); |
| 107 | + let mut data = &self.cached_blocks[..]; |
| 108 | + |
| 109 | + // T ← R◦T |
| 110 | + // P = T_0 + T_1 + T_2 + T_3 |
| 111 | + let mut p = self |
| 112 | + .initialized |
| 113 | + .take() |
| 114 | + .map(|inner| (inner.p * inner.m).sum().reduce()); |
| 115 | + |
| 116 | + if self.num_cached_blocks >= 2 { |
| 117 | + // Compute 32 byte block (remaining data < 64 bytes) |
| 118 | + let mut c = Aligned2x130::from_blocks(&data[0..BLOCK_SIZE * 2]); |
| 119 | + if let Some(p) = p { |
| 120 | + c = c + p; |
| 121 | + } |
| 122 | + p = Some(c.mul_and_sum(self.r1, self.r2).reduce()); |
| 123 | + data = &data[BLOCK_SIZE * 2..]; |
| 124 | + self.num_cached_blocks -= 2; |
| 125 | + } |
| 126 | + |
| 127 | + if self.num_cached_blocks == 1 { |
| 128 | + // Compute 16 byte block (remaining data < 32 bytes) |
| 129 | + let mut c = Aligned130::from_block(&data[0..BLOCK_SIZE]); |
| 130 | + if let Some(p) = p { |
| 131 | + c = c + p; |
| 132 | + } |
| 133 | + p = Some((c * self.r1).reduce()); |
| 134 | + self.num_cached_blocks -= 1; |
| 135 | + } |
| 136 | + |
| 137 | + if let Some(block) = &self.partial_block { |
| 138 | + // Compute last block (remaining data < 16 bytes) |
| 139 | + let mut c = Aligned130::from_partial_block(block); |
| 140 | + if let Some(p) = p { |
| 141 | + c = c + p; |
| 142 | + } |
| 143 | + p = Some((c * self.r1).reduce()); |
| 144 | + } |
| 145 | + |
| 146 | + // Compute tag: p + k mod 2^128 |
| 147 | + let mut tag = GenericArray::<u8, _>::default(); |
| 148 | + let tag_int = if let Some(p) = p { |
| 149 | + self.k + p |
| 150 | + } else { |
| 151 | + self.k.into() |
| 152 | + }; |
| 153 | + tag_int.write(tag.as_mut_slice()); |
| 154 | + |
| 155 | + Tag::new(tag) |
| 156 | + } |
| 157 | +} |
0 commit comments