Skip to content

Commit 0c84ace

Browse files
authored
polyval: make mul an instance method of the CLMUL implementation (#98)
1 parent a5985cb commit 0c84ace

File tree

1 file changed

+63
-61
lines changed

1 file changed

+63
-61
lines changed

polyval/src/backend/clmul.rs

+63 -61
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ impl UniversalHash for Polyval {
46 46
#[inline]
47 47
fn update(&mut self, x: &Block) {
48 48
unsafe {
49-
mul(self, x);
49+
self.mul(x);
50 50
}
51 51
}
52 52

@@ -63,66 +63,68 @@ impl UniversalHash for Polyval {
63 63
}
64 64
}
65 65

66-
#[inline]
67-
#[target_feature(enable = "pclmulqdq")]
68-
#[target_feature(enable = "sse4.1")]
69-
unsafe fn mul(polyval: &mut Polyval, x: &Block) {
70-
let h = polyval.h;
71-
72-
// `_mm_loadu_si128` performs an unaligned load
73-
#[allow(clippy::cast_ptr_alignment)]
74-
let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
75-
let y = _mm_xor_si128(polyval.y, x);
76-
77-
let h0 = h;
78-
let h1 = _mm_shuffle_epi32(h, 0x0E);
79-
let h2 = _mm_xor_si128(h0, h1);
80-
let y0 = y;
81-
82-
// Multiply values partitioned to 64-bit parts
83-
let y1 = _mm_shuffle_epi32(y, 0x0E);
84-
let y2 = _mm_xor_si128(y0, y1);
85-
let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
86-
let t1 = _mm_clmulepi64_si128(y, h, 0x11);
87-
let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
88-
let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
89-
let v0 = t0;
90-
let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
91-
let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
92-
let v3 = _mm_shuffle_epi32(t1, 0x0E);
93-
94-
// Polynomial reduction
95-
let v2 = xor5(
96-
v2,
97-
v0,
98-
_mm_srli_epi64(v0, 1),
99-
_mm_srli_epi64(v0, 2),
100-
_mm_srli_epi64(v0, 7),
101-
);
102-
103-
let v1 = xor4(
104-
v1,
105-
_mm_slli_epi64(v0, 63),
106-
_mm_slli_epi64(v0, 62),
107-
_mm_slli_epi64(v0, 57),
108-
);
109-
110-
let v3 = xor5(
111-
v3,
112-
v1,
113-
_mm_srli_epi64(v1, 1),
114-
_mm_srli_epi64(v1, 2),
115-
_mm_srli_epi64(v1, 7),
116-
);
117-
118-
let v2 = xor4(
119-
v2,
120-
_mm_slli_epi64(v1, 63),
121-
_mm_slli_epi64(v1, 62),
122-
_mm_slli_epi64(v1, 57),
123-
);
124-
125-
polyval.y = _mm_unpacklo_epi64(v2, v3);
66+
impl Polyval {
67+
#[inline]
68+
#[target_feature(enable = "pclmulqdq")]
69+
#[target_feature(enable = "sse4.1")]
70+
unsafe fn mul(&mut self, x: &Block) {
71+
let h = self.h;
72+
73+
// `_mm_loadu_si128` performs an unaligned load
74+
#[allow(clippy::cast_ptr_alignment)]
75+
let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
76+
let y = _mm_xor_si128(self.y, x);
77+
78+
let h0 = h;
79+
let h1 = _mm_shuffle_epi32(h, 0x0E);
80+
let h2 = _mm_xor_si128(h0, h1);
81+
let y0 = y;
82+
83+
// Multiply values partitioned to 64-bit parts
84+
let y1 = _mm_shuffle_epi32(y, 0x0E);
85+
let y2 = _mm_xor_si128(y0, y1);
86+
let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
87+
let t1 = _mm_clmulepi64_si128(y, h, 0x11);
88+
let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
89+
let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
90+
let v0 = t0;
91+
let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
92+
let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
93+
let v3 = _mm_shuffle_epi32(t1, 0x0E);
94+
95+
// Polynomial reduction
96+
let v2 = xor5(
97+
v2,
98+
v0,
99+
_mm_srli_epi64(v0, 1),
100+
_mm_srli_epi64(v0, 2),
101+
_mm_srli_epi64(v0, 7),
102+
);
103+
104+
let v1 = xor4(
105+
v1,
106+
_mm_slli_epi64(v0, 63),
107+
_mm_slli_epi64(v0, 62),
108+
_mm_slli_epi64(v0, 57),
109+
);
110+
111+
let v3 = xor5(
112+
v3,
113+
v1,
114+
_mm_srli_epi64(v1, 1),
115+
_mm_srli_epi64(v1, 2),
116+
_mm_srli_epi64(v1, 7),
117+
);
118+
119+
let v2 = xor4(
120+
v2,
121+
_mm_slli_epi64(v1, 63),
122+
_mm_slli_epi64(v1, 62),
123+
_mm_slli_epi64(v1, 57),
124+
);
125+
126+
self.y = _mm_unpacklo_epi64(v2, v3);
127+
}
126 128
}
127 129

128 130
#[inline(always)]

0 commit comments

Comments
 (0)