Crypto++ 8.7
Free C++ class library of cryptographic schemes
rijndael.cpp
1// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
 2// and Wei Dai from Paulo Barreto's Rijndael implementation
3// The original code and all modifications are in the public domain.
4
5// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6
7/*
8July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9 See the head notes in aes_armv4.S for copyright and license.
10*/
11
12/*
13September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14*/
15
16/*
17July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18*/
19
20/*
21July 2010: Added support for AES-NI instructions via compiler intrinsics.
22*/
23
24/*
 25Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27and Peter Schwabe in their paper "New AES software speed records". The round
28function was also modified to include a trick similar to one in Brian Gladman's
29x86 assembly code, doing an 8-bit register move to minimize the number of
30register spills. Also switched to compressed tables and copying round keys to
31the stack.
32
33The C++ implementation uses compressed tables if
34CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
 35It is defined on x86 platforms by default but not on others.
36*/
37
38/*
 39July 2006: Defense against timing attacks was added by Wei Dai.
40
41The code now uses smaller tables in the first and last rounds,
42and preloads them into L1 cache before usage (by loading at least
43one element in each cache line).
44
45We try to delay subsequent accesses to each table (used in the first
46and last rounds) until all of the table has been preloaded. Hopefully
47the compiler isn't smart enough to optimize that code away.
48
49After preloading the table, we also try not to access any memory location
50other than the table and the stack, in order to prevent table entries from
51being unloaded from L1 cache, until that round is finished.
52(Some popular CPUs have 2-way associative caches.)
53*/
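/*
A minimal sketch of the preload idiom described above. The helper name and the
64-byte cache line are illustrative assumptions; the real code further down
queries GetCacheLineSize() at run time and folds the loads into the cipher
state so they cannot be optimized away.

	static word32 PreloadTable(const word32 *table, size_t bytes)
	{
		volatile word32 _u = 0;                  // volatile source defeats constant folding
		word32 u = _u;
		for (size_t i = 0; i < bytes; i += 64)   // assumed 64-byte cache line
			u &= *(const word32 *)((const byte *)table + i);
		return u;                                // always 0; OR it into live state, e.g. s0 |= u
	}

After this runs, every cache line of the table should be resident in L1, so the
latency of the secret-indexed loads in the first and last rounds no longer
depends on the index values.
*/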
54
55// This is the original introductory comment:
56
57/**
58 * version 3.0 (December 2000)
59 *
60 * Optimised ANSI C code for the Rijndael cipher (now AES)
61 *
62 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64 * author Paulo Barreto <paulo.barreto@terra.com.br>
65 *
66 * This code is hereby placed in the public domain.
67 *
68 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79 */
80
81#include "pch.h"
82#include "config.h"
83
84#ifndef CRYPTOPP_IMPORTS
85#ifndef CRYPTOPP_GENERATE_X64_MASM
86
87#include "rijndael.h"
88#include "misc.h"
89#include "cpu.h"
90
91// VS2017 and global optimization bug. Also see
92// https://github.com/weidai11/cryptopp/issues/649
93#if (_MSC_VER >= 1910) && (_MSC_VER <= 1916)
94# ifndef CRYPTOPP_DEBUG
95# pragma optimize("", off)
96# pragma optimize("ts", on)
97# endif
98#endif
99
100NAMESPACE_BEGIN(CryptoPP)
101
102// Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
103#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
104# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
105#endif
106
107// Clang intrinsic casts
108#define M128I_CAST(x) ((__m128i *)(void *)(x))
109#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
110
111#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
112# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
113namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
114using namespace rdtable;
115# else
116static word64 Te[256];
117# endif
118static word64 Td[256];
119#else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
120# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
121// Unused; avoids linker error on Microsoft X64 non-AESNI platforms
122namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
123# endif
124CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
125CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
126#endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
127
128static volatile bool s_TeFilled = false, s_TdFilled = false;
129
130ANONYMOUS_NAMESPACE_BEGIN
131
132#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
133
134// Determine whether the range between begin and end overlaps
135// with the same 4k block offsets as the Te table. Logically,
136// the code is trying to create the condition:
137//
138// Two separate memory pages:
139//
140// +-----+ +-----+
141// |XXXXX| |YYYYY|
142// |XXXXX| |YYYYY|
143// | | | |
144// | | | |
145// +-----+ +-----+
146// Te Table Locals
147//
148// Have a logical cache view of (X and Y may be inverted):
149//
150// +-----+
151// |XXXXX|
152// |XXXXX|
153// |YYYYY|
154// |YYYYY|
155// +-----+
156//
157static inline bool AliasedWithTable(const byte *begin, const byte *end)
158{
159 ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
160 ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
161 if (t1 > t0)
162 return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
163 else
164 return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
165}
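/*
A worked example of the test above, with made-up addresses: if Te begins at
0x00407100 and spans 0x800 bytes, then t0 = 0x100 and t1 = 0x900. A stack
buffer at 0x0012F280 gives s0 = 0x280, which lies in [t0, t1), so the buffer
shares 4K page offsets (and therefore cache set indices) with part of Te and
the function returns true. The caller in AdvancedProcessBlocks() below keeps
sliding its Locals area by 256 bytes until the test fails:

	byte stack_buf[256];   // hypothetical local buffer
	bool aliased = AliasedWithTable(stack_buf, stack_buf + sizeof(stack_buf));
*/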
166
167struct Locals
168{
169 word32 subkeys[4*12], workspace[8];
170 const byte *inBlocks, *inXorBlocks, *outXorBlocks;
171 byte *outBlocks;
172 size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
173 size_t regSpill, lengthAndCounterFlag, keysBegin;
174};
175
176const size_t s_aliasPageSize = 4096;
177const size_t s_aliasBlockSize = 256;
178const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
179
180#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
181
182ANONYMOUS_NAMESPACE_END
183
184// ************************* Portable Code ************************************
185
186#define QUARTER_ROUND(L, T, t, a, b, c, d) \
187 a ^= L(T, 3, byte(t)); t >>= 8;\
188 b ^= L(T, 2, byte(t)); t >>= 8;\
189 c ^= L(T, 1, byte(t)); t >>= 8;\
190 d ^= L(T, 0, t);
191
192#define QUARTER_ROUND_LE(t, a, b, c, d) \
193 tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
194 tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
195 tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
196 tempBlock[d] = ((byte *)(Te+t))[1];
197
198#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
199 #define QUARTER_ROUND_LD(t, a, b, c, d) \
200 tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
201 tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
202 tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
203 tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
204#else
205 #define QUARTER_ROUND_LD(t, a, b, c, d) \
206 tempBlock[a] = Sd[byte(t)]; t >>= 8;\
207 tempBlock[b] = Sd[byte(t)]; t >>= 8;\
208 tempBlock[c] = Sd[byte(t)]; t >>= 8;\
209 tempBlock[d] = Sd[t];
210#endif
211
212#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
213#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
214
215#if (CRYPTOPP_LITTLE_ENDIAN)
216 #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
217 #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
218 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
219 #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
220 #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
221 #else
222 #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
223 #define TL_M(T, i, x) T[i*256 + x]
224 #endif
225#else
226 #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
227 #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
228 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
229 #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
230 #define TL_M TL_F
231 #else
232 #define TL_F(T, i, x) rotrFixed(T[x], i*8)
233 #define TL_M(T, i, x) T[i*256 + x]
234 #endif
235#endif
236
237
238#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
239#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
240#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
241
242#define f3(x) (f2(x) ^ x)
243#define f9(x) (f8(x) ^ x)
244#define fb(x) (f8(x) ^ f2(x) ^ x)
245#define fd(x) (f8(x) ^ f4(x) ^ x)
246#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
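/*
The f2..fe macros above are multiplication by small constants in GF(2^8) modulo
the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). A quick worked check of the
doubling step ("xtime"), using values that can be verified by hand:

	// f2(0x63): 0x63 << 1 = 0xC6, high bit of 0x63 is clear, no reduction -> 0xC6
	// f2(0x80): 0x80 << 1 = 0x100, high bit set, so 0x100 ^ 0x11b -> 0x1B
	// f3(0x63) = f2(0x63) ^ 0x63 = 0xC6 ^ 0x63 = 0xA5

The byte 0xA5 reappears below as part of the first encryption-table entry,
since each entry packs 2*S[x], S[x], S[x] and 3*S[x] for the MixColumns step.
*/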
247
248unsigned int Rijndael::Base::OptimalDataAlignment() const
249{
250#if (CRYPTOPP_AESNI_AVAILABLE)
251 if (HasAESNI())
252 return 16; // load __m128i
253#endif
254#if (CRYPTOPP_ARM_AES_AVAILABLE)
255 if (HasAES())
256 return 4; // load uint32x4_t
257#endif
258#if (CRYPTOGAMS_ARM_AES)
259 // Must use 1 here for Cryptogams AES. Also see
260 // https://github.com/weidai11/cryptopp/issues/683
261 if (HasARMv7())
262 return 1;
263#endif
264#if (CRYPTOPP_POWER8_AES_AVAILABLE)
265 if (HasAES())
266 return 16; // load uint32x4_p
267#endif
 268	return BlockTransformation::OptimalDataAlignment();
 269}
270
271void Rijndael::Base::FillEncTable()
272{
273 for (int i=0; i<256; i++)
274 {
275 byte x = Se[i];
276#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
277 word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
278 Te[i] = word64(y | f3(x))<<32 | y;
279#else
280 word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
281 for (int j=0; j<4; j++)
282 {
283 Te[i+j*256] = y;
284 y = rotrConstant<8>(y);
285 }
286#endif
287 }
288#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
289 Te[256] = Te[257] = 0;
290#endif
291 s_TeFilled = true;
292}
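/*
A worked example of the entry just built, for the compressed
(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS) layout. For i = 0 the S-box
gives Se[0] = 0x63, so f2 = 0xC6 and f3 = 0xA5, and:

	word32 y  = word32(0x63)<<8 | word32(0x63)<<16 | word32(0xC6)<<24;  // 0xC6636300
	word64 te = word64(y | 0xA5)<<32 | y;                               // 0xC66363A5C6636300

On a little-endian machine the entry's bytes are 00 63 63 C6 A5 63 63 C6, so
the unaligned 32-bit loads that TL_M/TL_F perform at byte offsets 1 through 4
read the four rotations of the classical round-table word Te0[0] = 0xC66363A5.
In the portable branch the same word is instead stored four times, rotated
8 bits each, at Te[0], Te[256], Te[512] and Te[768].
*/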
293
294void Rijndael::Base::FillDecTable()
295{
296 for (int i=0; i<256; i++)
297 {
298 byte x = Sd[i];
299#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
300 word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
301 Td[i] = word64(y | fb(x))<<32 | y | x;
302#else
303 word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
304 for (int j=0; j<4; j++)
305 {
306 Td[i+j*256] = y;
307 y = rotrConstant<8>(y);
308 }
309#endif
310 }
311 s_TdFilled = true;
312}
313
314#if (CRYPTOPP_AESNI_AVAILABLE)
315extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
316extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
317
318extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
319 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
320extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
321 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
322#endif
323
324#if (CRYPTOPP_ARM_AES_AVAILABLE)
325extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
326 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
327extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
328 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
329#endif
330
331#if (CRYPTOGAMS_ARM_AES)
332extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
333extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
334extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
335extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
336#endif
337
338#if (CRYPTOPP_POWER8_AES_AVAILABLE)
339extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
340 word32* rk, const byte* Se);
341
342extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
343 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
344extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
345 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
346#endif
347
348#if (CRYPTOGAMS_ARM_AES)
349int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
350{
351 return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey);
352}
353int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
354{
355 return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey);
356}
357void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
358{
359 cryptogams_AES_encrypt_block(inBlock, outBlock, rkey);
360 if (xorBlock)
361 xorbuf (outBlock, xorBlock, 16);
362}
363void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
364{
365 cryptogams_AES_decrypt_block(inBlock, outBlock, rkey);
366 if (xorBlock)
367 xorbuf (outBlock, xorBlock, 16);
368}
369#endif
370
371std::string Rijndael::Base::AlgorithmProvider() const
372{
373#if (CRYPTOPP_AESNI_AVAILABLE)
374 if (HasAESNI())
375 return "AESNI";
376#endif
377#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
378 if (HasSSE2())
379 return "SSE2";
380#endif
381#if (CRYPTOPP_ARM_AES_AVAILABLE)
382 if (HasAES())
383 return "ARMv8";
384#endif
385#if (CRYPTOGAMS_ARM_AES)
386 if (HasARMv7())
387 return "ARMv7";
388#endif
389#if (CRYPTOPP_POWER8_AES_AVAILABLE)
390 if (HasAES())
391 return "Power8";
392#endif
393 return "C++";
394}
395
396void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
397{
398 AssertValidKeyLength(keyLen);
399
400#if (CRYPTOGAMS_ARM_AES)
401 if (HasARMv7())
402 {
403 m_rounds = keyLen/4 + 6;
404 m_key.New(4*(14+1)+4);
405
406 if (IsForwardTransformation())
407 CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
408 else
409 CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
410 return;
411 }
412#endif
413
414#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
415 m_aliasBlock.New(s_sizeToAllocate);
416 // The alias block is only used on IA-32 when unaligned data access is in effect.
417 // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
418 m_aliasBlock.SetMark(0);
419#endif
420
421 m_rounds = keyLen/4 + 6;
422 m_key.New(4*(m_rounds+1));
423 word32 *rk = m_key;
424
425#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
426 // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
427 if (HasAESNI() && HasSSE41())
428 {
429 // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
430 // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
431 Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
432 if (!IsForwardTransformation())
433 Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
434
435 return;
436 }
437#endif
438
439#if CRYPTOPP_POWER8_AES_AVAILABLE
440 if (HasAES())
441 {
 442	// We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
 443	// The IBM docs on AES suck; Intel's docs on AESNI put IBM to shame.
444 Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
445 return;
446 }
447#endif
448
449 GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
450 const word32 *rc = rcon;
451 word32 temp;
452
453 while (true)
454 {
455 temp = rk[keyLen/4-1];
456 word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
457 (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
458 rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
459 rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
460 rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
461 rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
462
463 if (rk + keyLen/4 + 4 == m_key.end())
464 break;
465
466 if (keyLen == 24)
467 {
468 rk[10] = rk[ 4] ^ rk[ 9];
469 rk[11] = rk[ 5] ^ rk[10];
470 }
471 else if (keyLen == 32)
472 {
473 temp = rk[11];
474 rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
475 rk[13] = rk[ 5] ^ rk[12];
476 rk[14] = rk[ 6] ^ rk[13];
477 rk[15] = rk[ 7] ^ rk[14];
478 }
479 rk += keyLen/4;
480 }
481
482 rk = m_key;
483
484 if (IsForwardTransformation())
485 {
486 if (!s_TeFilled)
487 FillEncTable();
488
 489	ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
 490	ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
491 }
492 else
493 {
494 if (!s_TdFilled)
495 FillDecTable();
496
497 #define InverseMixColumn(x) \
498 TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
499 TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
500
501 unsigned int i, j;
502 for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
503 {
504 temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
505 temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
506 temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
507 temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
508 }
509
510 rk[i+0] = InverseMixColumn(rk[i+0]);
511 rk[i+1] = InverseMixColumn(rk[i+1]);
512 rk[i+2] = InverseMixColumn(rk[i+2]);
513 rk[i+3] = InverseMixColumn(rk[i+3]);
514
515 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
516 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
517 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
518 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
519 }
520
521#if CRYPTOPP_AESNI_AVAILABLE
522 if (HasAESNI())
523 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
524#endif
525#if CRYPTOPP_ARM_AES_AVAILABLE
526 if (HasAES())
527 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
528#endif
529}
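/*
A minimal sketch of how this key schedule is reached through the public
interface; the key and variable names are illustrative only:

	#include "aes.h"                         // AES is a typedef for Rijndael
	using CryptoPP::AES; using CryptoPP::byte;

	byte key[AES::DEFAULT_KEYLENGTH] = {0};  // 16 bytes; use a real secret key
	AES::Encryption enc;
	enc.SetKey(key, sizeof(key));            // routed to UncheckedSetKey() above

For a 16-byte key, m_rounds = 16/4 + 6 = 10 and m_key holds 4*(10+1) = 44 round
key words; 24- and 32-byte keys give 12 and 14 rounds respectively.
*/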
530
531void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
532{
533#if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
534# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
535 if (HasSSE2())
536# else
537 if (HasAESNI())
538# endif
539 {
540 (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
541 return;
542 }
543#endif
544
545#if (CRYPTOPP_ARM_AES_AVAILABLE)
546 if (HasAES())
547 {
548 (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
549 return;
550 }
551#endif
552
553#if (CRYPTOGAMS_ARM_AES)
554 if (HasARMv7())
555 {
556 CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
557 return;
558 }
559#endif
560
561#if (CRYPTOPP_POWER8_AES_AVAILABLE)
562 if (HasAES())
563 {
564 (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
565 return;
566 }
567#endif
568
 569	typedef BlockGetAndPut<word32, NativeByteOrder> Block;
 570
571 word32 s0, s1, s2, s3, t0, t1, t2, t3;
572 Block::Get(inBlock)(s0)(s1)(s2)(s3);
573
574 const word32 *rk = m_key;
575 s0 ^= rk[0];
576 s1 ^= rk[1];
577 s2 ^= rk[2];
578 s3 ^= rk[3];
579 t0 = rk[4];
580 t1 = rk[5];
581 t2 = rk[6];
582 t3 = rk[7];
583 rk += 8;
584
585 // timing attack countermeasure. see comments at top for more details.
586 // also see http://github.com/weidai11/cryptopp/issues/146
587 const int cacheLineSize = GetCacheLineSize();
588 unsigned int i;
589 volatile word32 _u = 0;
590 word32 u = _u;
591#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
592 for (i=0; i<2048; i+=cacheLineSize)
593#else
594 for (i=0; i<1024; i+=cacheLineSize)
595#endif
596 u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
597 u &= Te[255];
598 s0 |= u; s1 |= u; s2 |= u; s3 |= u;
599
600 QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
601 QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
602 QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
603 QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
604
605 // Nr - 2 full rounds:
606 unsigned int r = m_rounds/2 - 1;
607 do
608 {
609 s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
610
611 QUARTER_ROUND_E(t3, s0, s1, s2, s3)
612 QUARTER_ROUND_E(t2, s3, s0, s1, s2)
613 QUARTER_ROUND_E(t1, s2, s3, s0, s1)
614 QUARTER_ROUND_E(t0, s1, s2, s3, s0)
615
616 t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
617
618 QUARTER_ROUND_E(s3, t0, t1, t2, t3)
619 QUARTER_ROUND_E(s2, t3, t0, t1, t2)
620 QUARTER_ROUND_E(s1, t2, t3, t0, t1)
621 QUARTER_ROUND_E(s0, t1, t2, t3, t0)
622
623 rk += 8;
624 } while (--r);
625
626 word32 tbw[4];
627 byte *const tempBlock = (byte *)tbw;
628
629 QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
630 QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
631 QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
632 QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
633
634 Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
635}
636
637void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
638{
639#if CRYPTOPP_AESNI_AVAILABLE
640 if (HasAESNI())
641 {
642 (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
643 return;
644 }
645#endif
646
647#if (CRYPTOPP_ARM_AES_AVAILABLE)
648 if (HasAES())
649 {
650 (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
651 return;
652 }
653#endif
654
655#if (CRYPTOGAMS_ARM_AES)
656 if (HasARMv7())
657 {
658 CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
659 return;
660 }
661#endif
662
663#if (CRYPTOPP_POWER8_AES_AVAILABLE)
664 if (HasAES())
665 {
666 (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
667 return;
668 }
669#endif
670
 671	typedef BlockGetAndPut<word32, NativeByteOrder> Block;
 672
673 word32 s0, s1, s2, s3, t0, t1, t2, t3;
674 Block::Get(inBlock)(s0)(s1)(s2)(s3);
675
676 const word32 *rk = m_key;
677 s0 ^= rk[0];
678 s1 ^= rk[1];
679 s2 ^= rk[2];
680 s3 ^= rk[3];
681 t0 = rk[4];
682 t1 = rk[5];
683 t2 = rk[6];
684 t3 = rk[7];
685 rk += 8;
686
687 // timing attack countermeasure. see comments at top for more details.
688 // also see http://github.com/weidai11/cryptopp/issues/146
689 const int cacheLineSize = GetCacheLineSize();
690 unsigned int i;
691 volatile word32 _u = 0;
692 word32 u = _u;
693#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
694 for (i=0; i<2048; i+=cacheLineSize)
695#else
696 for (i=0; i<1024; i+=cacheLineSize)
697#endif
698 u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
699 u &= Td[255];
700 s0 |= u; s1 |= u; s2 |= u; s3 |= u;
701
702 QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
703 QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
704 QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
705 QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
706
707 // Nr - 2 full rounds:
708 unsigned int r = m_rounds/2 - 1;
709 do
710 {
711 s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
712
713 QUARTER_ROUND_D(t3, s2, s1, s0, s3)
714 QUARTER_ROUND_D(t2, s1, s0, s3, s2)
715 QUARTER_ROUND_D(t1, s0, s3, s2, s1)
716 QUARTER_ROUND_D(t0, s3, s2, s1, s0)
717
718 t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
719
720 QUARTER_ROUND_D(s3, t2, t1, t0, t3)
721 QUARTER_ROUND_D(s2, t1, t0, t3, t2)
722 QUARTER_ROUND_D(s1, t0, t3, t2, t1)
723 QUARTER_ROUND_D(s0, t3, t2, t1, t0)
724
725 rk += 8;
726 } while (--r);
727
728#if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
729 // timing attack countermeasure. see comments at top for more details
730 // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
731 // QUARTER_ROUND_LD will use Td, which is already preloaded.
732 u = _u;
733 for (i=0; i<256; i+=cacheLineSize)
734 u &= *(const word32 *)(const void *)(Sd+i);
735 u &= *(const word32 *)(const void *)(Sd+252);
736 t0 |= u; t1 |= u; t2 |= u; t3 |= u;
737#endif
738
739 word32 tbw[4];
740 byte *const tempBlock = (byte *)tbw;
741
742 QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
743 QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
744 QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
745 QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
746
747 Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
748}
749
750// ************************* Assembly Code ************************************
751
752#if CRYPTOPP_MSC_VERSION
753# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
754#endif
755
756#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
757
758#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
759
760CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
761{
762 CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
763
764#if CRYPTOPP_BOOL_X86
765
766#define L_REG esp
767#define L_INDEX(i) (L_REG+768+i)
768#define L_INXORBLOCKS L_INBLOCKS+4
769#define L_OUTXORBLOCKS L_INBLOCKS+8
770#define L_OUTBLOCKS L_INBLOCKS+12
771#define L_INCREMENTS L_INDEX(16*15)
772#define L_SP L_INDEX(16*16)
773#define L_LENGTH L_INDEX(16*16+4)
774#define L_KEYS_BEGIN L_INDEX(16*16+8)
775
776#define MOVD movd
777#define MM(i) mm##i
778
779#define MXOR(a,b,c) \
780 AS2( movzx esi, b)\
781 AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
782 AS2( pxor MM(a), mm7)\
783
784#define MMOV(a,b,c) \
785 AS2( movzx esi, b)\
786 AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
787
788#else
789
790#define L_REG r8
791#define L_INDEX(i) (L_REG+i)
792#define L_INXORBLOCKS L_INBLOCKS+8
793#define L_OUTXORBLOCKS L_INBLOCKS+16
794#define L_OUTBLOCKS L_INBLOCKS+24
795#define L_INCREMENTS L_INDEX(16*16)
796#define L_LENGTH L_INDEX(16*18+8)
797#define L_KEYS_BEGIN L_INDEX(16*19)
798
799#define MOVD mov
800#define MM_0 r9d
801#define MM_1 r12d
802#ifdef __GNUC__
803#define MM_2 r11d
804#else
805#define MM_2 r10d
806#endif
807#define MM(i) MM_##i
808
809#define MXOR(a,b,c) \
810 AS2( movzx esi, b)\
811 AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
812
813#define MMOV(a,b,c) \
814 AS2( movzx esi, b)\
815 AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
816
817#endif
818
819#define L_SUBKEYS L_INDEX(0)
820#define L_SAVED_X L_SUBKEYS
821#define L_KEY12 L_INDEX(16*12)
822#define L_LASTROUND L_INDEX(16*13)
823#define L_INBLOCKS L_INDEX(16*14)
824#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
825
826#define XOR(a,b,c) \
827 AS2( movzx esi, b)\
828 AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
829
830#define MOV(a,b,c) \
831 AS2( movzx esi, b)\
832 AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
833
834#ifdef CRYPTOPP_GENERATE_X64_MASM
835 ALIGN 8
836 Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
837 rex_push_reg rsi
838 push_reg rdi
839 push_reg rbx
840 push_reg r12
841 .endprolog
842 mov L_REG, rcx
843 mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
844 mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
845#elif defined(__GNUC__)
846 __asm__ __volatile__
847 (
848 INTEL_NOPREFIX
 849	#if CRYPTOPP_BOOL_X64
 850	AS2(	mov		L_REG, rcx)
851 #endif
852 AS_PUSH_IF86(bx)
853 AS_PUSH_IF86(bp)
854 AS2( mov AS_REG_7, WORD_REG(si))
855#else
856 AS_PUSH_IF86(si)
857 AS_PUSH_IF86(di)
858 AS_PUSH_IF86(bx)
859 AS_PUSH_IF86(bp)
860 AS2( lea AS_REG_7, [Te])
861 AS2( mov edi, [g_cacheLineSize])
862#endif
863
 864#if CRYPTOPP_BOOL_X86
 865	AS2(	mov		[ecx+16*12+16*4], esp)	// save esp to L_SP
866 AS2( lea esp, [ecx-768])
867#endif
868
869 // copy subkeys to stack
870 AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
871 AS2( mov WORD_REG(ax), 16)
872 AS2( and WORD_REG(ax), WORD_REG(si))
873 AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
874 AS2( movdqa [L_KEY12], xmm3)
875 AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
876 AS2( sub WORD_REG(ax), WORD_REG(si))
877 ASL(0)
878 AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
879 AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
880 AS2( add WORD_REG(si), 16)
881 AS2( cmp WORD_REG(si), 16*12)
882 ATT_NOPREFIX
883 ASJ( jl, 0, b)
884 INTEL_NOPREFIX
885
886 // read subkeys 0, 1 and last
887 AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
888 AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
889 AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
890 AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
891 AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
892 AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
893
894 // load table into cache
895 AS2( xor WORD_REG(ax), WORD_REG(ax))
896 ASL(9)
897 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
898 AS2( add WORD_REG(ax), WORD_REG(di))
899 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
900 AS2( add WORD_REG(ax), WORD_REG(di))
901 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
902 AS2( add WORD_REG(ax), WORD_REG(di))
903 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
904 AS2( add WORD_REG(ax), WORD_REG(di))
905 AS2( cmp WORD_REG(ax), 2048)
906 ATT_NOPREFIX
907 ASJ( jl, 9, b)
908 INTEL_NOPREFIX
909 AS1( lfence)
910
911 AS2( test DWORD PTR [L_LENGTH], 1)
912 ATT_NOPREFIX
913 ASJ( jz, 8, f)
914 INTEL_NOPREFIX
915
916 // counter mode one-time setup
917 AS2( mov WORD_REG(si), [L_INBLOCKS])
918 AS2( movdqu xmm2, [WORD_REG(si)]) // counter
919 AS2( pxor xmm2, xmm1)
920 AS2( psrldq xmm1, 14)
921 AS2( movd eax, xmm1)
922 AS2( mov al, BYTE PTR [WORD_REG(si)+15])
923 AS2( MOVD MM(2), eax)
 924#if CRYPTOPP_BOOL_X86
 925	AS2(	mov		eax, 1)
926 AS2( movd mm3, eax)
927#endif
928
929 // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
930 AS2( movd eax, xmm2)
931 AS2( psrldq xmm2, 4)
932 AS2( movd edi, xmm2)
933 AS2( psrldq xmm2, 4)
934 MXOR( 1, al, 0) // 0
935 XOR( edx, ah, 1) // 1
936 AS2( shr eax, 16)
937 XOR( ecx, al, 2) // 2
938 XOR( ebx, ah, 3) // 3
939 AS2( mov eax, edi)
940 AS2( movd edi, xmm2)
941 AS2( psrldq xmm2, 4)
942 XOR( ebx, al, 0) // 4
943 MXOR( 1, ah, 1) // 5
944 AS2( shr eax, 16)
945 XOR( edx, al, 2) // 6
946 XOR( ecx, ah, 3) // 7
947 AS2( mov eax, edi)
948 AS2( movd edi, xmm2)
949 XOR( ecx, al, 0) // 8
950 XOR( ebx, ah, 1) // 9
951 AS2( shr eax, 16)
952 MXOR( 1, al, 2) // 10
953 XOR( edx, ah, 3) // 11
954 AS2( mov eax, edi)
955 XOR( edx, al, 0) // 12
956 XOR( ecx, ah, 1) // 13
957 AS2( shr eax, 16)
958 XOR( ebx, al, 2) // 14
959 AS2( psrldq xmm2, 3)
960
961 // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
962 AS2( mov eax, [L_KEY12+0*4])
963 AS2( mov edi, [L_KEY12+2*4])
964 AS2( MOVD MM(0), [L_KEY12+3*4])
965 MXOR( 0, cl, 3) /* 11 */
966 XOR( edi, bl, 3) /* 7 */
967 MXOR( 0, bh, 2) /* 6 */
968 AS2( shr ebx, 16) /* 4,5 */
969 XOR( eax, bl, 1) /* 5 */
970 MOV( ebx, bh, 0) /* 4 */
971 AS2( xor ebx, [L_KEY12+1*4])
972 XOR( eax, ch, 2) /* 10 */
973 AS2( shr ecx, 16) /* 8,9 */
974 XOR( eax, dl, 3) /* 15 */
975 XOR( ebx, dh, 2) /* 14 */
976 AS2( shr edx, 16) /* 12,13 */
977 XOR( edi, ch, 0) /* 8 */
978 XOR( ebx, cl, 1) /* 9 */
979 XOR( edi, dl, 1) /* 13 */
980 MXOR( 0, dh, 0) /* 12 */
981
982 AS2( movd ecx, xmm2)
983 AS2( MOVD edx, MM(1))
984 AS2( MOVD [L_SAVED_X+3*4], MM(0))
985 AS2( mov [L_SAVED_X+0*4], eax)
986 AS2( mov [L_SAVED_X+1*4], ebx)
987 AS2( mov [L_SAVED_X+2*4], edi)
988 ATT_NOPREFIX
989 ASJ( jmp, 5, f)
990 INTEL_NOPREFIX
991 ASL(3)
992 // non-counter mode per-block setup
993 AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
994 AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
995 AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
996 AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
997 ASL(8)
998 AS2( mov WORD_REG(ax), [L_INBLOCKS])
999 AS2( movdqu xmm2, [WORD_REG(ax)])
1000 AS2( mov WORD_REG(si), [L_INXORBLOCKS])
1001 AS2( movdqu xmm5, [WORD_REG(si)])
1002 AS2( pxor xmm2, xmm1)
1003 AS2( pxor xmm2, xmm5)
1004
1005 // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1006 AS2( movd eax, xmm2)
1007 AS2( psrldq xmm2, 4)
1008 AS2( movd edi, xmm2)
1009 AS2( psrldq xmm2, 4)
1010 MXOR( 1, al, 0) // 0
1011 XOR( edx, ah, 1) // 1
1012 AS2( shr eax, 16)
1013 XOR( ecx, al, 2) // 2
1014 XOR( ebx, ah, 3) // 3
1015 AS2( mov eax, edi)
1016 AS2( movd edi, xmm2)
1017 AS2( psrldq xmm2, 4)
1018 XOR( ebx, al, 0) // 4
1019 MXOR( 1, ah, 1) // 5
1020 AS2( shr eax, 16)
1021 XOR( edx, al, 2) // 6
1022 XOR( ecx, ah, 3) // 7
1023 AS2( mov eax, edi)
1024 AS2( movd edi, xmm2)
1025 XOR( ecx, al, 0) // 8
1026 XOR( ebx, ah, 1) // 9
1027 AS2( shr eax, 16)
1028 MXOR( 1, al, 2) // 10
1029 XOR( edx, ah, 3) // 11
1030 AS2( mov eax, edi)
1031 XOR( edx, al, 0) // 12
1032 XOR( ecx, ah, 1) // 13
1033 AS2( shr eax, 16)
1034 XOR( ebx, al, 2) // 14
1035 MXOR( 1, ah, 3) // 15
1036 AS2( MOVD eax, MM(1))
1037
1038 AS2( add L_REG, [L_KEYS_BEGIN])
1039 AS2( add L_REG, 4*16)
1040 ATT_NOPREFIX
1041 ASJ( jmp, 2, f)
1042 INTEL_NOPREFIX
1043 ASL(1)
1044 // counter-mode per-block setup
1045 AS2( MOVD ecx, MM(2))
1046 AS2( MOVD edx, MM(1))
1047 AS2( mov eax, [L_SAVED_X+0*4])
1048 AS2( mov ebx, [L_SAVED_X+1*4])
1049 AS2( xor cl, ch)
1050 AS2( and WORD_REG(cx), 255)
1051 ASL(5)
 1052#if CRYPTOPP_BOOL_X86
 1053	AS2(	paddb	MM(2), mm3)
1054#else
1055 AS2( add MM(2), 1)
1056#endif
1057 // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1058 AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1059 XOR( ebx, dl, 3)
1060 MOV( ecx, dh, 2)
1061 AS2( shr edx, 16)
1062 AS2( xor ecx, [L_SAVED_X+2*4])
1063 XOR( eax, dh, 0)
1064 MOV( edx, dl, 1)
1065 AS2( xor edx, [L_SAVED_X+3*4])
1066
1067 AS2( add L_REG, [L_KEYS_BEGIN])
1068 AS2( add L_REG, 3*16)
1069 ATT_NOPREFIX
1070 ASJ( jmp, 4, f)
1071 INTEL_NOPREFIX
1072
1073// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1074// out: eax, ebx, edi, mm0
1075#define ROUND() \
1076 MXOR( 0, cl, 3) /* 11 */\
1077 AS2( mov cl, al) /* 8,9,10,3 */\
1078 XOR( edi, ah, 2) /* 2 */\
1079 AS2( shr eax, 16) /* 0,1 */\
1080 XOR( edi, bl, 3) /* 7 */\
1081 MXOR( 0, bh, 2) /* 6 */\
1082 AS2( shr ebx, 16) /* 4,5 */\
1083 MXOR( 0, al, 1) /* 1 */\
1084 MOV( eax, ah, 0) /* 0 */\
1085 XOR( eax, bl, 1) /* 5 */\
1086 MOV( ebx, bh, 0) /* 4 */\
1087 XOR( eax, ch, 2) /* 10 */\
1088 XOR( ebx, cl, 3) /* 3 */\
1089 AS2( shr ecx, 16) /* 8,9 */\
1090 XOR( eax, dl, 3) /* 15 */\
1091 XOR( ebx, dh, 2) /* 14 */\
1092 AS2( shr edx, 16) /* 12,13 */\
1093 XOR( edi, ch, 0) /* 8 */\
1094 XOR( ebx, cl, 1) /* 9 */\
1095 XOR( edi, dl, 1) /* 13 */\
1096 MXOR( 0, dh, 0) /* 12 */\
1097
1098 ASL(2) // 2-round loop
1099 AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
1100 AS2( mov edi, [L_SUBKEYS-4*16+2*4])
1101 ROUND()
1102 AS2( mov ecx, edi)
1103 AS2( xor eax, [L_SUBKEYS-4*16+0*4])
1104 AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
1105 AS2( MOVD edx, MM(0))
1106
1107 ASL(4)
1108 AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
1109 AS2( mov edi, [L_SUBKEYS-4*16+6*4])
1110 ROUND()
1111 AS2( mov ecx, edi)
1112 AS2( xor eax, [L_SUBKEYS-4*16+4*4])
1113 AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
1114 AS2( MOVD edx, MM(0))
1115
1116 AS2( add L_REG, 32)
1117 AS2( test L_REG, 255)
1118 ATT_NOPREFIX
1119 ASJ( jnz, 2, b)
1120 INTEL_NOPREFIX
1121 AS2( sub L_REG, 16*16)
1122
1123#define LAST(a, b, c) \
1124 AS2( movzx esi, a )\
1125 AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1126 AS2( movzx esi, b )\
1127 AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
1128 AS2( mov WORD PTR [L_LASTROUND+c], di )\
1129
1130 // last round
1131 LAST(ch, dl, 2)
1132 LAST(dh, al, 6)
1133 AS2( shr edx, 16)
1134 LAST(ah, bl, 10)
1135 AS2( shr eax, 16)
1136 LAST(bh, cl, 14)
1137 AS2( shr ebx, 16)
1138 LAST(dh, al, 12)
1139 AS2( shr ecx, 16)
1140 LAST(ah, bl, 0)
1141 LAST(bh, cl, 4)
1142 LAST(ch, dl, 8)
1143
1144 AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
1145 AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
1146
1147 AS2( mov WORD_REG(cx), [L_LENGTH])
1148 AS2( sub WORD_REG(cx), 16)
1149
1150 AS2( movdqu xmm2, [WORD_REG(ax)])
1151 AS2( pxor xmm2, xmm4)
1152
 1153#if CRYPTOPP_BOOL_X86
 1154	AS2(	movdqa	xmm0, [L_INCREMENTS])
1155 AS2( paddd xmm0, [L_INBLOCKS])
1156 AS2( movdqa [L_INBLOCKS], xmm0)
1157#else
1158 AS2( movdqa xmm0, [L_INCREMENTS+16])
1159 AS2( paddq xmm0, [L_INBLOCKS+16])
1160 AS2( movdqa [L_INBLOCKS+16], xmm0)
1161#endif
1162
1163 AS2( pxor xmm2, [L_LASTROUND])
1164 AS2( movdqu [WORD_REG(bx)], xmm2)
1165
1166 ATT_NOPREFIX
1167 ASJ( jle, 7, f)
1168 INTEL_NOPREFIX
1169 AS2( mov [L_LENGTH], WORD_REG(cx))
1170 AS2( test WORD_REG(cx), 1)
1171 ATT_NOPREFIX
1172 ASJ( jnz, 1, b)
1173 INTEL_NOPREFIX
 1174#if CRYPTOPP_BOOL_X64
 1175	AS2(	movdqa	xmm0, [L_INCREMENTS])
1176 AS2( paddq xmm0, [L_INBLOCKS])
1177 AS2( movdqa [L_INBLOCKS], xmm0)
1178#endif
1179 ATT_NOPREFIX
1180 ASJ( jmp, 3, b)
1181 INTEL_NOPREFIX
1182
1183 ASL(7)
1184 // erase keys on stack
1185 AS2( xorps xmm0, xmm0)
1186 AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
1187 AS2( movaps [WORD_REG(ax)-7*16], xmm0)
1188 AS2( movaps [WORD_REG(ax)-6*16], xmm0)
1189 AS2( movaps [WORD_REG(ax)-5*16], xmm0)
1190 AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1191 AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1192 AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1193 AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1194 AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1195 AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1196 AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1197 AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1198 AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1199 AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1200 AS2( movaps [WORD_REG(ax)+6*16], xmm0)
 1201#if CRYPTOPP_BOOL_X86
 1202	AS2(	mov		esp, [L_SP])
1203 AS1( emms)
1204#endif
1205 AS_POP_IF86(bp)
1206 AS_POP_IF86(bx)
1207#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1208 AS_POP_IF86(di)
1209 AS_POP_IF86(si)
1210 AS1(ret)
1211#endif
1212#ifdef CRYPTOPP_GENERATE_X64_MASM
1213 pop r12
1214 pop rbx
1215 pop rdi
1216 pop rsi
1217 ret
1218 Rijndael_Enc_AdvancedProcessBlocks ENDP
1219#endif
1220#ifdef __GNUC__
1221 ATT_PREFIX
1222 :
1223 : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1224 : "memory", "cc", "%eax"
 1225	#if CRYPTOPP_BOOL_X64
 1226	, "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1227 #endif
1228 );
1229#endif
1230}
1231
1232#endif
1233
1234#ifndef CRYPTOPP_GENERATE_X64_MASM
1235
1236#ifdef CRYPTOPP_X64_MASM_AVAILABLE
1237extern "C" {
1238void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1239}
1240#endif
1241
1242#if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1243size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1244{
1245#if CRYPTOPP_AESNI_AVAILABLE
1246 if (HasAESNI())
1247 return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1248#endif
1249#if CRYPTOPP_ARM_AES_AVAILABLE
1250 if (HasAES())
1251 return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1252#endif
1253#if CRYPTOPP_POWER8_AES_AVAILABLE
1254 if (HasAES())
1255 return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1256#endif
1257
1258#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1259 if (HasSSE2())
1260 {
1261 if (length < BLOCKSIZE)
1262 return length;
1263
1264 static const byte *zeros = (const byte*)(Te+256);
1265 m_aliasBlock.SetMark(m_aliasBlock.size());
1266 byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1267
1268 // round up to nearest 256 byte boundary
1269 space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1270 while (AliasedWithTable(space, space + sizeof(Locals)))
1271 {
1272 space += 256;
1273 CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1274 }
1275
1276 size_t increment = BLOCKSIZE;
1277 if (flags & BT_ReverseDirection)
1278 {
1279 CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1280 inBlocks += length - BLOCKSIZE;
1281 xorBlocks += length - BLOCKSIZE;
1282 outBlocks += length - BLOCKSIZE;
1283 increment = 0-increment;
1284 }
1285
1286 Locals &locals = *(Locals *)(void *)space;
1287
1288 locals.inBlocks = inBlocks;
1289 locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1290 locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1291 locals.outBlocks = outBlocks;
1292
1293 locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1294 locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1295 locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1296 locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1297
1298 locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1299 int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1300 locals.keysBegin = (12-keysToCopy)*16;
1301
1302 Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);
1303
1304 return length % BLOCKSIZE;
1305 }
1306#endif
1307
1308 return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1309}
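/*
A minimal sketch of driving the bulk interface directly; buffer and key names
are illustrative. With flags = 0 each 16-byte block is processed independently
(ECB-style); the BT_* flags declared in cryptlib.h request xoring of the input,
counter-mode handling, reverse-direction processing, and so on. The return
value is the number of trailing bytes left unprocessed:

	using namespace CryptoPP;
	byte key[AES::DEFAULT_KEYLENGTH] = {0};   // use a real secret key
	byte in[4 * AES::BLOCKSIZE] = {0}, out[4 * AES::BLOCKSIZE];
	AES::Encryption enc(key, sizeof(key));
	size_t left = enc.AdvancedProcessBlocks(in, nullptr, out, sizeof(in), 0);
	// left == 0 here, since the length is a whole number of blocks
*/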
1310
1311size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1312{
1313#if CRYPTOPP_AESNI_AVAILABLE
1314 if (HasAESNI())
1315 return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1316#endif
1317#if CRYPTOPP_ARM_AES_AVAILABLE
1318 if (HasAES())
1319 return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1320#endif
1321#if CRYPTOPP_POWER8_AES_AVAILABLE
1322 if (HasAES())
1323 return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1324#endif
1325
1326 return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1327}
1328#endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1329
1330NAMESPACE_END
1331
1332#endif
1333#endif