// Crypto++ 8.7 — free C++ class library of cryptographic schemes
// arm_simd.h — see the generated Doxygen documentation for this file.
1// arm_simd.h - written and placed in public domain by Jeffrey Walton
2
3/// \file arm_simd.h
4/// \brief Support functions for ARM and vector operations
5
6#ifndef CRYPTOPP_ARM_SIMD_H
7#define CRYPTOPP_ARM_SIMD_H
8
9#include "config.h"
10
11#if (CRYPTOPP_ARM_NEON_HEADER)
12# include <stdint.h>
13# include <arm_neon.h>
14#endif
15
16#if (CRYPTOPP_ARM_ACLE_HEADER)
17# include <stdint.h>
18# include <arm_acle.h>
19#endif
20
21#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
22/// \name CRC32 checksum
23//@{
24
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the byte value to fold into the checksum
/// \return updated CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32B (uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    // MSVC exposes the ACLE CRC32 intrinsic directly.
    return __crc32b(crc, val);
#else
    // GCC/Clang path: inline asm avoids depending on the ACLE intrinsic,
    // which some older compilers lack. "+r" makes crc both input and output.
    __asm__ ("crc32b %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
40
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the 32-bit word to fold into the checksum
/// \return updated CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32W (uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    // MSVC exposes the ACLE CRC32 intrinsic directly.
    return __crc32w(crc, val);
#else
    // GCC/Clang path: inline asm; "+r" makes crc both input and output.
    __asm__ ("crc32w %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
56
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param vals four 32-bit words folded into the checksum in order
/// \return updated CRC32 value
/// \details Processes vals[0] through vals[3] sequentially, equivalent to
///  four chained CRC32W() calls but issued as one asm block.
/// \since Crypto++ 8.6
inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    // MSVC: chain the ACLE intrinsic; innermost call consumes vals[0].
    return __crc32w(__crc32w(__crc32w(__crc32w(
        crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    // GCC/Clang: four dependent crc32w instructions in a single asm block.
    __asm__ ("crc32w %w0, %w0, %w1 \n\t"
            "crc32w %w0, %w0, %w2 \n\t"
            "crc32w %w0, %w0, %w3 \n\t"
            "crc32w %w0, %w0, %w4 \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
77
78//@}
/// \name CRC32-C checksum
//@{
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the byte value to fold into the checksum
/// \return updated CRC32-C (Castagnoli) value
/// \since Crypto++ 8.6
inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    // MSVC exposes the ACLE CRC32-C intrinsic directly.
    return __crc32cb(crc, val);
#else
    // GCC/Clang path: inline asm; "+r" makes crc both input and output.
    __asm__ ("crc32cb %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
96
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the 32-bit word to fold into the checksum
/// \return updated CRC32-C (Castagnoli) value
/// \since Crypto++ 8.6
inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    // MSVC exposes the ACLE CRC32-C intrinsic directly.
    return __crc32cw(crc, val);
#else
    // GCC/Clang path: inline asm; "+r" makes crc both input and output.
    __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
112
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param vals four 32-bit words folded into the checksum in order
/// \return updated CRC32-C (Castagnoli) value
/// \details Processes vals[0] through vals[3] sequentially, equivalent to
///  four chained CRC32CW() calls but issued as one asm block.
/// \since Crypto++ 8.6
inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    // MSVC: chain the ACLE intrinsic; innermost call consumes vals[0].
    return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
        crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    // GCC/Clang: four dependent crc32cw instructions in a single asm block.
    __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
            "crc32cw %w0, %w0, %w2 \n\t"
            "crc32cw %w0, %w0, %w3 \n\t"
            "crc32cw %w0, %w0, %w4 \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
133//@}
134#endif // CRYPTOPP_ARM_CRC32_AVAILABLE
135
136#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
137/// \name Polynomial multiplication
138//@{
139
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
///  The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
///  are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC: build __n64 operands from the low lanes and use the intrinsic.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: pmull multiplies the low 64-bit lanes of both vectors.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback for other compilers.
    // NOTE(review): vreinterpretq_u64_u8 is applied to uint64x2_t arguments
    // here — looks like a type mismatch inherited from upstream; verify on a
    // compiler that actually takes this branch.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
169
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
///  The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
///  64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC: low lane of a, high lane of b, then the intrinsic.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: pass b's high half so pmull sees it as a low lane.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    // Generic intrinsic fallback for other compilers.
    // NOTE(review): vreinterpretq_u64_u8 on uint64x2_t inherited from
    // upstream — verify on a compiler that takes this branch.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
199
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
///  The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
///  64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC: high lane of a, low lane of b, then the intrinsic.
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: pass a's high half so pmull sees it as a low lane.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback for other compilers.
    // NOTE(review): vreinterpretq_u64_u8 on uint64x2_t inherited from
    // upstream — verify on a compiler that takes this branch.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
229
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
///  The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
///  are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC: extract both high lanes and use the intrinsic.
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: pmull2 multiplies the high 64-bit lanes of both vectors.
    uint64x2_t r;
    __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback for other compilers.
    // NOTE(review): vreinterpretq_u64_u8 on uint64x2_t inherited from
    // upstream — verify on a compiler that takes this branch.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
259
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL() performs vmull_p64() on the low 64-bit lanes. PMULL is
///  provided as GCC inline assembly due to Clang and lack of support for
///  the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC: build __n64 operands from the low lanes and use the intrinsic.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: pmull multiplies the low 64-bit lanes of both vectors.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback for other compilers.
    // NOTE(review): vreinterpretq_u64_u8 on uint64x2_t inherited from
    // upstream — verify on a compiler that takes this branch.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
284
285/// \brief Polynomial multiplication
286/// \param a the first value
287/// \param b the second value
288/// \return vector product
289/// \details PMULL_HIGH() performs vmull_high_p64(). PMULL_HIGH is provided as
290/// GCC inline assembly due to Clang and lack of support for the intrinsic.
291/// \since Crypto++ 8.0
292inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
293{
294#if defined(_MSC_VER)
295 const __n64 x = { vgetq_lane_u64(a, 1) };
296 const __n64 y = { vgetq_lane_u64(b, 1) };
297 return vmull_p64(x, y);
298#elif defined(__GNUC__)
299 uint64x2_t r;
300 __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
301 :"=w" (r) : "w" (a), "w" (b) );
302 return r;
303#else
304 return (uint64x2_t)(vmull_p64(
305 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
306 vgetq_lane_u64(vreinterpretq_u64_u8(b),1))));
307#endif
308}
309
/// \brief Vector extraction
/// \param a the first value
/// \param b the second value
/// \param c the byte count
/// \return vector
/// \details VEXT_U8() extracts the first <tt>c</tt> bytes of vector
///  <tt>a</tt> and the remaining bytes in <tt>b</tt>. VEXT_U8 is provided
///  as GCC inline assembly due to Clang and lack of support for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if defined(_MSC_VER)
    // MSVC: reinterpret to bytes, extract, reinterpret back.
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c));
#else
    // GCC/Clang: ext instruction. NOTE(review): the "I" constraint requires
    // c to be a compile-time constant after inlining — callers must pass an
    // immediate; confirm all call sites do.
    uint64x2_t r;
    __asm__ ("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c) );
    return r;
#endif
}
331
/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first value
/// \param b the second value
/// \return vector
/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
///  <tt>a</tt> and the remaining bytes in <tt>b</tt>. VEXT_U8 is provided
///  as GCC inline assembly due to Clang and lack of support for the intrinsic.
/// \since Crypto++ 8.0
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
    // Template form guarantees the shift count is a compile-time constant,
    // which the "I" asm constraint requires.
    // https://github.com/weidai11/cryptopp/issues/366
#if defined(_MSC_VER)
    // MSVC: reinterpret to bytes, extract, reinterpret back.
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    // GCC/Clang: ext instruction with immediate byte count C.
    uint64x2_t r;
    __asm__ ("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
355
356//@}
357#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
358
359#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
360/// \name ARMv8.2 operations
361//@{
362
/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return three-way exclusive OR of the values
/// \details VEOR3() performs veor3q_u64(). VEOR3 is provided as GCC inline assembly due
///  to Clang and lack of support for the intrinsic.
/// \details VEOR3 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(_MSC_VER)
    // MSVC exposes the ARMv8.2 SHA-3 intrinsic directly.
    return veor3q_u64(a, b, c);
#else
    // GCC/Clang: single eor3 instruction (a ^ b ^ c).
    uint64x2_t r;
    __asm__ ("eor3 %0.16b, %1.16b, %2.16b, %3.16b \n\t"
            :"=w" (r) : "w" (a), "w" (b), "w" (c));
    return r;
#endif
}
383
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param c the rotate amount
/// \return two-way exclusive OR of the values, then rotated by c
/// \details VXAR() performs vxarq_u64(). VXAR is provided as GCC inline assembly due
///  to Clang and lack of support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
{
#if defined(_MSC_VER)
    // MSVC exposes the ARMv8.2 SHA-3 intrinsic directly.
    return vxarq_u64(a, b, c);
#else
    // GCC/Clang: xar instruction. NOTE(review): the "I" constraint requires
    // c to be a compile-time constant after inlining — callers must pass an
    // immediate; confirm all call sites do (or use the template overload).
    uint64x2_t r;
    __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c));
    return r;
#endif
}
404
/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated by C
/// \details VXAR() performs vxarq_u64(). VXAR is provided as GCC inline assembly due
///  to Clang and lack of support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
    // Template form guarantees the rotate amount is a compile-time constant,
    // which the "I" asm constraint requires.
#if defined(_MSC_VER)
    // MSVC exposes the ARMv8.2 SHA-3 intrinsic directly.
    return vxarq_u64(a, b, C);
#else
    // GCC/Clang: xar instruction with immediate rotate C.
    uint64x2_t r;
    __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C));
    return r;
#endif
}
426
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated 1-bit
/// \details VRAX1() performs vrax1q_u64(). VRAX1 is provided as GCC inline assembly due
///  to Clang and lack of support for the intrinsic.
/// \details VRAX1 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC exposes the ARMv8.2 SHA-3 intrinsic directly.
    return vrax1q_u64(a, b);
#else
    // GCC/Clang: single rax1 instruction (a ^ rotl64(b, 1) per lane).
    uint64x2_t r;
    __asm__ ("rax1 %0.2d, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b));
    return r;
#endif
}
446//@}
447#endif // CRYPTOPP_ARM_SHA3_AVAILABLE
448
449#endif // CRYPTOPP_ARM_SIMD_H
// ---------------------------------------------------------------------------
// Doxygen cross-reference index (rendered-page artifact, kept as comments):
//   uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
//       XOR and rotate. Definition: arm_simd.h:393
//   uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
//       Polynomial multiplication. Definition: arm_simd.h:152
//   uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
//       XOR and rotate. Definition: arm_simd.h:435
//   uint32_t CRC32CWx4(uint32_t crc, const uint32_t vals[4])
//       CRC32-C checksum. Definition: arm_simd.h:118
//   uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
//       Polynomial multiplication. Definition: arm_simd.h:242
//   uint32_t CRC32CB(uint32_t crc, uint8_t val)
//       CRC32-C checksum. Definition: arm_simd.h:86
//   uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
//       Polynomial multiplication. Definition: arm_simd.h:292
//   uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
//       Polynomial multiplication. Definition: arm_simd.h:182
//   uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
//       Three-way XOR. Definition: arm_simd.h:372
//   uint32_t CRC32W(uint32_t crc, uint32_t val)
//       CRC32 checksum. Definition: arm_simd.h:46
//   uint32_t CRC32B(uint32_t crc, uint8_t val)
//       CRC32 checksum. Definition: arm_simd.h:30
//   uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
//       Polynomial multiplication. Definition: arm_simd.h:212
//   uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
//       Polynomial multiplication. Definition: arm_simd.h:267
//   uint32_t CRC32CW(uint32_t crc, uint32_t val)
//       CRC32-C checksum. Definition: arm_simd.h:102
//   uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
//       Vector extraction. Definition: arm_simd.h:319
//   uint32_t CRC32Wx4(uint32_t crc, const uint32_t vals[4])
//       CRC32 checksum. Definition: arm_simd.h:62
//   config.h — library configuration file.
// ---------------------------------------------------------------------------