Crypto++ 8.7
Free C++ class library of cryptographic schemes
chacha_avx.cpp
1// chacha_avx.cpp - written and placed in the public domain by
2// Jack Lloyd and Jeffrey Walton
3//
4// This source file uses intrinsics and built-ins to gain access to
5// AVX2 instructions. A separate source file is needed because
6// additional CXXFLAGS are required to enable the appropriate
7// instruction sets in some build configurations.
8//
9// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
10// to Jack Lloyd and the Botan team for allowing us to use it.
11//
12// Here are some relative numbers for ChaCha8:
13// * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
14// * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
15// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
16
17#include "pch.h"
18#include "config.h"
19
20#include "chacha.h"
21#include "misc.h"
22
23#if defined(CRYPTOPP_AVX2_AVAILABLE)
24# include <xmmintrin.h>
25# include <emmintrin.h>
26# include <immintrin.h>
27#endif
28
29// Squash MS LNK4221 and libtool warnings
// This definition sits outside every CRYPTOPP_AVX2_AVAILABLE guard, so the
// translation unit still defines a symbol when the AVX2 code is compiled out.
30extern const char CHACHA_AVX_FNAME[] = __FILE__;
31
32// Sun Studio 12.4 OK, 12.5 and 12.6 compile error.
// MAYBE_CONST expands to nothing when __SUNPRO_CC is in [0x5140, 0x5150]
// and to `const` for every other compiler. It qualifies the __m256i pointer
// type used in the casts that feed _mm256_loadu_si256 in this file.
33#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
34# define MAYBE_CONST
35#else
36# define MAYBE_CONST const
37#endif
38
39// VS2017 and global optimization bug. Also see
40// https://github.com/weidai11/cryptopp/issues/649 and
41// https://github.com/weidai11/cryptopp/issues/735. The
42// 649 issue affects AES but it is the same here. The 735
43// issue is ChaCha AVX2 cut-in where it surfaced again.
// The workaround applies only when _MSC_VER is in [1910, 1916] and
// CRYPTOPP_DEBUG is not defined. It first switches every optimization off
// and then re-enables only the "t" and "s" options, which leaves the
// remaining optimize options off for the functions defined after this point.
44#if (_MSC_VER >= 1910) && (_MSC_VER <= 1916)
45# ifndef CRYPTOPP_DEBUG
46# pragma optimize("", off)
47# pragma optimize("ts", on)
48# endif
49#endif
50
51// The data is aligned, but Clang issues warning based on type
52// and not the actual alignment of the variable and data.
// The casts in question are the reinterpret_casts from byte/word32 pointers
// to __m128i*/__m256i* below; every load and store through them uses the
// unaligned (loadu/storeu) intrinsics. The warning is silenced only when
// CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE evaluates to non-zero.
53#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
54# pragma GCC diagnostic ignored "-Wcast-align"
55#endif
56
57ANONYMOUS_NAMESPACE_BEGIN
58
59#if (CRYPTOPP_AVX2_AVAILABLE)
60
// Rotates every 32-bit element of val left by R bits. The bits shifted out
// of the top of an element re-enter at the bottom of the same element.
template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    const __m256i shiftedUp = _mm256_slli_epi32(val, R);
    const __m256i wrapped = _mm256_srli_epi32(val, 32-R);
    return _mm256_or_si256(shiftedUp, wrapped);
}
66
67template <>
68inline __m256i RotateLeft<8>(const __m256i val)
69{
70 const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
71 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
72 return _mm256_shuffle_epi8(val, mask);
73}
74
75template <>
76inline __m256i RotateLeft<16>(const __m256i val)
77{
78 const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
79 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
80 return _mm256_shuffle_epi8(val, mask);
81}
82
83#endif // CRYPTOPP_AVX2_AVAILABLE
84
85ANONYMOUS_NAMESPACE_END
86
87NAMESPACE_BEGIN(CryptoPP)
88
89#if (CRYPTOPP_AVX2_AVAILABLE)
90
91void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
92{
93 const __m256i state0 = _mm256_broadcastsi128_si256(
94 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
95 const __m256i state1 = _mm256_broadcastsi128_si256(
96 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
97 const __m256i state2 = _mm256_broadcastsi128_si256(
98 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
99 const __m256i state3 = _mm256_broadcastsi128_si256(
100 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
101
102 const word32 C = 0xFFFFFFFFu - state[12];
103 const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);
104 const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
105 const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
106 const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);
107
108 __m256i X0_0 = state0;
109 __m256i X0_1 = state1;
110 __m256i X0_2 = state2;
111 __m256i X0_3 = _mm256_add_epi32(state3, CTR0);
112
113 __m256i X1_0 = state0;
114 __m256i X1_1 = state1;
115 __m256i X1_2 = state2;
116 __m256i X1_3 = _mm256_add_epi32(state3, CTR1);
117
118 __m256i X2_0 = state0;
119 __m256i X2_1 = state1;
120 __m256i X2_2 = state2;
121 __m256i X2_3 = _mm256_add_epi32(state3, CTR2);
122
123 __m256i X3_0 = state0;
124 __m256i X3_1 = state1;
125 __m256i X3_2 = state2;
126 __m256i X3_3 = _mm256_add_epi32(state3, CTR3);
127
128 for (int i = static_cast<int>(rounds); i > 0; i -= 2)
129 {
130 X0_0 = _mm256_add_epi32(X0_0, X0_1);
131 X1_0 = _mm256_add_epi32(X1_0, X1_1);
132 X2_0 = _mm256_add_epi32(X2_0, X2_1);
133 X3_0 = _mm256_add_epi32(X3_0, X3_1);
134
135 X0_3 = _mm256_xor_si256(X0_3, X0_0);
136 X1_3 = _mm256_xor_si256(X1_3, X1_0);
137 X2_3 = _mm256_xor_si256(X2_3, X2_0);
138 X3_3 = _mm256_xor_si256(X3_3, X3_0);
139
140 X0_3 = RotateLeft<16>(X0_3);
141 X1_3 = RotateLeft<16>(X1_3);
142 X2_3 = RotateLeft<16>(X2_3);
143 X3_3 = RotateLeft<16>(X3_3);
144
145 X0_2 = _mm256_add_epi32(X0_2, X0_3);
146 X1_2 = _mm256_add_epi32(X1_2, X1_3);
147 X2_2 = _mm256_add_epi32(X2_2, X2_3);
148 X3_2 = _mm256_add_epi32(X3_2, X3_3);
149
150 X0_1 = _mm256_xor_si256(X0_1, X0_2);
151 X1_1 = _mm256_xor_si256(X1_1, X1_2);
152 X2_1 = _mm256_xor_si256(X2_1, X2_2);
153 X3_1 = _mm256_xor_si256(X3_1, X3_2);
154
155 X0_1 = RotateLeft<12>(X0_1);
156 X1_1 = RotateLeft<12>(X1_1);
157 X2_1 = RotateLeft<12>(X2_1);
158 X3_1 = RotateLeft<12>(X3_1);
159
160 X0_0 = _mm256_add_epi32(X0_0, X0_1);
161 X1_0 = _mm256_add_epi32(X1_0, X1_1);
162 X2_0 = _mm256_add_epi32(X2_0, X2_1);
163 X3_0 = _mm256_add_epi32(X3_0, X3_1);
164
165 X0_3 = _mm256_xor_si256(X0_3, X0_0);
166 X1_3 = _mm256_xor_si256(X1_3, X1_0);
167 X2_3 = _mm256_xor_si256(X2_3, X2_0);
168 X3_3 = _mm256_xor_si256(X3_3, X3_0);
169
170 X0_3 = RotateLeft<8>(X0_3);
171 X1_3 = RotateLeft<8>(X1_3);
172 X2_3 = RotateLeft<8>(X2_3);
173 X3_3 = RotateLeft<8>(X3_3);
174
175 X0_2 = _mm256_add_epi32(X0_2, X0_3);
176 X1_2 = _mm256_add_epi32(X1_2, X1_3);
177 X2_2 = _mm256_add_epi32(X2_2, X2_3);
178 X3_2 = _mm256_add_epi32(X3_2, X3_3);
179
180 X0_1 = _mm256_xor_si256(X0_1, X0_2);
181 X1_1 = _mm256_xor_si256(X1_1, X1_2);
182 X2_1 = _mm256_xor_si256(X2_1, X2_2);
183 X3_1 = _mm256_xor_si256(X3_1, X3_2);
184
185 X0_1 = RotateLeft<7>(X0_1);
186 X1_1 = RotateLeft<7>(X1_1);
187 X2_1 = RotateLeft<7>(X2_1);
188 X3_1 = RotateLeft<7>(X3_1);
189
190 X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
191 X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
192 X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));
193
194 X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
195 X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
196 X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));
197
198 X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
199 X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
200 X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));
201
202 X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
203 X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
204 X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));
205
206 X0_0 = _mm256_add_epi32(X0_0, X0_1);
207 X1_0 = _mm256_add_epi32(X1_0, X1_1);
208 X2_0 = _mm256_add_epi32(X2_0, X2_1);
209 X3_0 = _mm256_add_epi32(X3_0, X3_1);
210
211 X0_3 = _mm256_xor_si256(X0_3, X0_0);
212 X1_3 = _mm256_xor_si256(X1_3, X1_0);
213 X2_3 = _mm256_xor_si256(X2_3, X2_0);
214 X3_3 = _mm256_xor_si256(X3_3, X3_0);
215
216 X0_3 = RotateLeft<16>(X0_3);
217 X1_3 = RotateLeft<16>(X1_3);
218 X2_3 = RotateLeft<16>(X2_3);
219 X3_3 = RotateLeft<16>(X3_3);
220
221 X0_2 = _mm256_add_epi32(X0_2, X0_3);
222 X1_2 = _mm256_add_epi32(X1_2, X1_3);
223 X2_2 = _mm256_add_epi32(X2_2, X2_3);
224 X3_2 = _mm256_add_epi32(X3_2, X3_3);
225
226 X0_1 = _mm256_xor_si256(X0_1, X0_2);
227 X1_1 = _mm256_xor_si256(X1_1, X1_2);
228 X2_1 = _mm256_xor_si256(X2_1, X2_2);
229 X3_1 = _mm256_xor_si256(X3_1, X3_2);
230
231 X0_1 = RotateLeft<12>(X0_1);
232 X1_1 = RotateLeft<12>(X1_1);
233 X2_1 = RotateLeft<12>(X2_1);
234 X3_1 = RotateLeft<12>(X3_1);
235
236 X0_0 = _mm256_add_epi32(X0_0, X0_1);
237 X1_0 = _mm256_add_epi32(X1_0, X1_1);
238 X2_0 = _mm256_add_epi32(X2_0, X2_1);
239 X3_0 = _mm256_add_epi32(X3_0, X3_1);
240
241 X0_3 = _mm256_xor_si256(X0_3, X0_0);
242 X1_3 = _mm256_xor_si256(X1_3, X1_0);
243 X2_3 = _mm256_xor_si256(X2_3, X2_0);
244 X3_3 = _mm256_xor_si256(X3_3, X3_0);
245
246 X0_3 = RotateLeft<8>(X0_3);
247 X1_3 = RotateLeft<8>(X1_3);
248 X2_3 = RotateLeft<8>(X2_3);
249 X3_3 = RotateLeft<8>(X3_3);
250
251 X0_2 = _mm256_add_epi32(X0_2, X0_3);
252 X1_2 = _mm256_add_epi32(X1_2, X1_3);
253 X2_2 = _mm256_add_epi32(X2_2, X2_3);
254 X3_2 = _mm256_add_epi32(X3_2, X3_3);
255
256 X0_1 = _mm256_xor_si256(X0_1, X0_2);
257 X1_1 = _mm256_xor_si256(X1_1, X1_2);
258 X2_1 = _mm256_xor_si256(X2_1, X2_2);
259 X3_1 = _mm256_xor_si256(X3_1, X3_2);
260
261 X0_1 = RotateLeft<7>(X0_1);
262 X1_1 = RotateLeft<7>(X1_1);
263 X2_1 = RotateLeft<7>(X2_1);
264 X3_1 = RotateLeft<7>(X3_1);
265
266 X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
267 X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
268 X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
269
270 X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
271 X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
272 X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));
273
274 X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
275 X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
276 X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));
277
278 X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
279 X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
280 X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
281 }
282
283 X0_0 = _mm256_add_epi32(X0_0, state0);
284 X0_1 = _mm256_add_epi32(X0_1, state1);
285 X0_2 = _mm256_add_epi32(X0_2, state2);
286 X0_3 = _mm256_add_epi32(X0_3, state3);
287 X0_3 = _mm256_add_epi32(X0_3, CTR0);
288
289 X1_0 = _mm256_add_epi32(X1_0, state0);
290 X1_1 = _mm256_add_epi32(X1_1, state1);
291 X1_2 = _mm256_add_epi32(X1_2, state2);
292 X1_3 = _mm256_add_epi32(X1_3, state3);
293 X1_3 = _mm256_add_epi32(X1_3, CTR1);
294
295 X2_0 = _mm256_add_epi32(X2_0, state0);
296 X2_1 = _mm256_add_epi32(X2_1, state1);
297 X2_2 = _mm256_add_epi32(X2_2, state2);
298 X2_3 = _mm256_add_epi32(X2_3, state3);
299 X2_3 = _mm256_add_epi32(X2_3, CTR2);
300
301 X3_0 = _mm256_add_epi32(X3_0, state0);
302 X3_1 = _mm256_add_epi32(X3_1, state1);
303 X3_2 = _mm256_add_epi32(X3_2, state2);
304 X3_3 = _mm256_add_epi32(X3_3, state3);
305 X3_3 = _mm256_add_epi32(X3_3, CTR3);
306
307 if (input)
308 {
309 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
310 _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
311 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
312 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
313 _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
314 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
315 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
316 _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
317 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
318 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
319 _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
320 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
321 }
322 else
323 {
324 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
325 _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
326 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
327 _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
328 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
329 _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
330 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
331 _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
332 }
333
334 if (input)
335 {
336 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
337 _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
338 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
339 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
340 _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
341 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
342 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
343 _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
344 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
345 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
346 _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
347 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
348 }
349 else
350 {
351 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
352 _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
353 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
354 _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
355 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
356 _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
357 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
358 _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
359 }
360
361 if (input)
362 {
363 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
364 _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
365 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
366 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
367 _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
368 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
369 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
370 _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
371 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
372 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
373 _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
374 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
375 }
376 else
377 {
378 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
379 _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
380 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
381 _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
382 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
383 _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
384 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
385 _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
386 }
387
388 if (input)
389 {
390 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
391 _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
392 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
393 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
394 _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
395 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
396 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
397 _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
398 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
399 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
400 _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
401 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
402 }
403 else
404 {
405 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
406 _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
407 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
408 _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
409 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
410 _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
411 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
412 _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
413 }
414
415 // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
416 _mm256_zeroupper();
417}
418
419#endif // CRYPTOPP_AVX2_AVAILABLE
420
421NAMESPACE_END
#define MAYBE_CONST
SunCC workaround.
Definition: adv_simd.h:590
Classes for ChaCha8, ChaCha12 and ChaCha20 stream ciphers.
Library configuration file.
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:62
Utility functions for the Crypto++ library.
Crypto++ library namespace.
Precompiled header file.