|
| 1 | +//! Constant-time software implementation of POLYVAL for 32-bit architectures |
| 2 | +//! |
| 3 | +//! Adapted from BearSSL's `ghash_ctmul32.c` |
| 4 | +//! <https://bearssl.org/gitweb/?p=BearSSL;a=blob_plain;f=src/hash/ghash_ctmul32.c;hb=4b6046412bf927d6424f20fc7ee495bb96dbd227> |
| 5 | +//! |
| 6 | +//! Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> |
| 7 | +//! |
| 8 | +//! This implementation uses 32-bit multiplications, and only the low |
| 9 | +//! 32 bits for each multiplication result. This is meant primarily for |
| 10 | +//! the ARM Cortex M0 and M0+, whose multiplication opcode does not yield |
| 11 | +//! the upper 32 bits; but it might also be useful on architectures where |
| 12 | +//! access to the upper 32 bits requires use of specific registers that |
| 13 | +//! create contention (e.g. on i386, "mul" necessarily outputs the result |
| 14 | +//! in edx:eax, while "imul" can use any registers but is limited to the |
| 15 | +//! low 32 bits). |
| 16 | +//! |
| 17 | +//! The implementation trick that is used here is bit-reversing (bit 0 |
| 18 | +//! is swapped with bit 31, bit 1 with bit 30, and so on). In GF(2)[X], |
| 19 | +//! for all values x and y, we have: |
| 20 | +//! |
| 21 | +//! ```text |
| 22 | +//! rev32(x) * rev32(y) = rev64(x * y) |
| 23 | +//! ``` |
| 24 | +//! |
| 25 | +//! In other words, if we bit-reverse (over 32 bits) the operands, then we |
| 26 | +//! bit-reverse (over 64 bits) the result. |
| 27 | +
|
| 28 | +use crate::field::Block; |
| 29 | +use core::{ |
| 30 | + convert::TryInto, |
| 31 | + num::Wrapping, |
| 32 | + ops::{Add, Mul}, |
| 33 | +}; |
| 34 | + |
/// 4 x `u32` values
///
/// Represents a 128-bit POLYVAL field element as four little-endian 32-bit
/// limbs: field `.0` holds bits 0..32 and `.3` holds bits 96..128, matching
/// the `From<Block>` and `From<U32x4> for u128` conversions in this module.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct U32x4(u32, u32, u32, u32);
| 38 | + |
| 39 | +impl From<Block> for U32x4 { |
| 40 | + fn from(bytes: Block) -> U32x4 { |
| 41 | + U32x4( |
| 42 | + u32::from_le_bytes(bytes[..4].try_into().unwrap()), |
| 43 | + u32::from_le_bytes(bytes[4..8].try_into().unwrap()), |
| 44 | + u32::from_le_bytes(bytes[8..12].try_into().unwrap()), |
| 45 | + u32::from_le_bytes(bytes[12..].try_into().unwrap()), |
| 46 | + ) |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +impl From<U32x4> for Block { |
| 51 | + fn from(u32x4: U32x4) -> Block { |
| 52 | + let x: u128 = u32x4.into(); |
| 53 | + x.to_le_bytes() |
| 54 | + } |
| 55 | +} |
| 56 | + |
| 57 | +impl From<U32x4> for u128 { |
| 58 | + fn from(u32x4: U32x4) -> u128 { |
| 59 | + u128::from(u32x4.0) |
| 60 | + | (u128::from(u32x4.1) << 32) |
| 61 | + | (u128::from(u32x4.2) << 64) |
| 62 | + | (u128::from(u32x4.3) << 96) |
| 63 | + } |
| 64 | +} |
| 65 | + |
| 66 | +#[allow(clippy::suspicious_arithmetic_impl)] |
| 67 | +impl Add for U32x4 { |
| 68 | + type Output = Self; |
| 69 | + |
| 70 | + /// Adds two POLYVAL field elements. |
| 71 | + fn add(self, rhs: Self) -> Self::Output { |
| 72 | + U32x4( |
| 73 | + self.0 ^ rhs.0, |
| 74 | + self.1 ^ rhs.1, |
| 75 | + self.2 ^ rhs.2, |
| 76 | + self.3 ^ rhs.3, |
| 77 | + ) |
| 78 | + } |
| 79 | +} |
| 80 | + |
#[allow(clippy::suspicious_arithmetic_impl)]
impl Mul for U32x4 {
    type Output = Self;

    /// Computes carryless POLYVAL multiplication over GF(2^128) in constant time.
    ///
    /// Method described at:
    /// <https://www.bearssl.org/constanttime.html#ghash-for-gcm>
    ///
    /// POLYVAL multiplication is effectively the little endian equivalent of
    /// GHASH multiplication, aside from one small detail described here:
    ///
    /// <https://crypto.stackexchange.com/questions/66448/how-does-bearssls-gcm-modular-reduction-work/66462#66462>
    ///
    /// > The product of two bit-reversed 128-bit polynomials yields the
    /// > bit-reversed result over 255 bits, not 256. The BearSSL code ends up
    /// > with a 256-bit result in zw[], and that value is shifted by one bit,
    /// > because of that reversed convention issue. Thus, the code must
    /// > include a shifting step to put it back where it should
    ///
    /// This shift is unnecessary for POLYVAL and has been removed.
    fn mul(self, rhs: Self) -> Self {
        // `hw`/`yw` are the two operands as 32-bit limbs; `hwr` holds
        // bit-reversed copies of `self`'s limbs, used to recover the upper
        // halves of the carryless products (see the module docs:
        // rev32(x) * rev32(y) = rev64(x * y)).
        let hw = [self.0, self.1, self.2, self.3];
        let yw = [rhs.0, rhs.1, rhs.2, rhs.3];
        let hwr = [rev32(hw[0]), rev32(hw[1]), rev32(hw[2]), rev32(hw[3])];

        // We are using Karatsuba: the 128x128 multiplication is
        // reduced to three 64x64 multiplications, hence nine
        // 32x32 multiplications. With the bit-reversal trick,
        // we have to perform 18 32x32 multiplications.

        let mut a = [0u32; 18];

        // a[0..9]: `rhs` limbs plus the XOR combinations Karatsuba needs;
        // a[9..18]: the same nine values, bit-reversed.
        a[0] = yw[0];
        a[1] = yw[1];
        a[2] = yw[2];
        a[3] = yw[3];
        a[4] = a[0] ^ a[1];
        a[5] = a[2] ^ a[3];
        a[6] = a[0] ^ a[2];
        a[7] = a[1] ^ a[3];
        a[8] = a[6] ^ a[7];
        a[9] = rev32(yw[0]);
        a[10] = rev32(yw[1]);
        a[11] = rev32(yw[2]);
        a[12] = rev32(yw[3]);
        a[13] = a[9] ^ a[10];
        a[14] = a[11] ^ a[12];
        a[15] = a[9] ^ a[11];
        a[16] = a[10] ^ a[12];
        a[17] = a[15] ^ a[16];

        // Same layout for `self`, reusing the precomputed reversals in `hwr`.
        let mut b = [0u32; 18];

        b[0] = hw[0];
        b[1] = hw[1];
        b[2] = hw[2];
        b[3] = hw[3];
        b[4] = b[0] ^ b[1];
        b[5] = b[2] ^ b[3];
        b[6] = b[0] ^ b[2];
        b[7] = b[1] ^ b[3];
        b[8] = b[6] ^ b[7];
        b[9] = hwr[0];
        b[10] = hwr[1];
        b[11] = hwr[2];
        b[12] = hwr[3];
        b[13] = b[9] ^ b[10];
        b[14] = b[11] ^ b[12];
        b[15] = b[9] ^ b[11];
        b[16] = b[10] ^ b[12];
        b[17] = b[15] ^ b[16];

        // The 18 low-half carryless products.
        let mut c = [0u32; 18];

        for i in 0..18 {
            c[i] = bmul32(a[i], b[i]);
        }

        // Karatsuba recombination: fold the "sum" products back so that
        // c[4], c[5], c[8] (and their reversed counterparts c[13], c[14],
        // c[17]) hold only the middle cross terms.
        c[4] ^= c[0] ^ c[1];
        c[5] ^= c[2] ^ c[3];
        c[8] ^= c[6] ^ c[7];

        c[13] ^= c[9] ^ c[10];
        c[14] ^= c[11] ^ c[12];
        c[17] ^= c[15] ^ c[16];

        // Assemble the 256-bit product as eight little-endian words.
        // Each `rev32(...) >> 1` term is an upper product half recovered from
        // the bit-reversed multiplications; the extra `>> 1` exists because a
        // 32x32 carryless product spans 63 bits, not 64 (per the rev64
        // identity in the module docs).
        let mut zw = [0u32; 8];

        zw[0] = c[0];
        zw[1] = c[4] ^ rev32(c[9]) >> 1;
        zw[2] = c[1] ^ c[0] ^ c[2] ^ c[6] ^ rev32(c[13]) >> 1;
        zw[3] = c[4] ^ c[5] ^ c[8] ^ rev32(c[10] ^ c[9] ^ c[11] ^ c[15]) >> 1;
        zw[4] = c[2] ^ c[1] ^ c[3] ^ c[7] ^ rev32(c[13] ^ c[14] ^ c[17]) >> 1;
        zw[5] = c[5] ^ rev32(c[11] ^ c[10] ^ c[12] ^ c[16]) >> 1;
        zw[6] = c[3] ^ rev32(c[14]) >> 1;
        zw[7] = rev32(c[12]) >> 1;

        // Reduce the 256-bit value modulo the POLYVAL field polynomial
        // x^128 + x^127 + x^126 + x^121 + 1: each low word `lw` is folded
        // into the high half, the shift sets {1, 2, 7} / {31, 30, 25}
        // encoding the polynomial's nonzero terms (see the linked
        // crypto.stackexchange answer for the derivation).
        for i in 0..4 {
            let lw = zw[i];
            zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7);
            zw[i + 3] ^= (lw << 31) ^ (lw << 30) ^ (lw << 25);
        }

        // After reduction the result lives entirely in the upper four words.
        U32x4(zw[4], zw[5], zw[6], zw[7])
    }
}
| 188 | + |
/// Multiplication in GF(2)[X], truncated to the low 32 bits, with "holes"
/// (sequences of zeroes) to avoid carry spilling.
///
/// When carries do occur, they wind up in a "hole" and are subsequently masked
/// out of the result.
fn bmul32(x: u32, y: u32) -> u32 {
    // One mask per phase: each selects every fourth bit, leaving three-bit
    // holes between set bits so integer-multiplication carries cannot reach
    // the next retained bit before being masked away.
    const MASK: [u32; 4] = [0x1111_1111, 0x2222_2222, 0x4444_4444, 0x8888_8888];

    let xw = [
        Wrapping(x & MASK[0]),
        Wrapping(x & MASK[1]),
        Wrapping(x & MASK[2]),
        Wrapping(x & MASK[3]),
    ];
    let yw = [
        Wrapping(y & MASK[0]),
        Wrapping(y & MASK[1]),
        Wrapping(y & MASK[2]),
        Wrapping(y & MASK[3]),
    ];

    let mut out = 0;

    // Lane `i` accumulates the partial products whose phases sum to `i`
    // (mod 4); everything outside that lane's bit positions is a carry
    // artifact and gets masked off.
    for i in 0..4 {
        let mut lane = Wrapping(0u32);

        for j in 0..4 {
            lane ^= xw[j] * yw[(i + 4 - j) & 3];
        }

        out |= lane.0 & MASK[i];
    }

    out
}
| 216 | + |
/// Bit-reverse a 32-bit word in constant time.
///
/// Swaps halves, then bytes, then nibbles, then bit pairs, then single bits.
/// The swap stages are mutually independent, so they may run in any order;
/// here they run coarse-to-fine.
fn rev32(x: u32) -> u32 {
    let x = x.rotate_left(16);
    let x = ((x & 0x00ff_00ff) << 8) | ((x >> 8) & 0x00ff_00ff);
    let x = ((x & 0x0f0f_0f0f) << 4) | ((x >> 4) & 0x0f0f_0f0f);
    let x = ((x & 0x3333_3333) << 2) | ((x >> 2) & 0x3333_3333);
    ((x & 0x5555_5555) << 1) | ((x >> 1) & 0x5555_5555)
}
0 commit comments