ce188d4d |
1 | /* AesOpt.c -- Intel's AES\r |
2 | 2013-11-12 : Igor Pavlov : Public domain */\r |
3 | \r |
4 | #include "Precomp.h"\r |
5 | \r |
6 | #include "CpuArch.h"\r |
7 | \r |
8 | #ifdef MY_CPU_X86_OR_AMD64\r |
9 | #if _MSC_VER >= 1500\r |
10 | #define USE_INTEL_AES\r |
11 | #endif\r |
12 | #endif\r |
13 | \r |
14 | #ifdef USE_INTEL_AES\r |
15 | \r |
16 | #include <wmmintrin.h>\r |
17 | \r |
18 | void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r |
19 | {\r |
20 | __m128i m = *p;\r |
21 | for (; numBlocks != 0; numBlocks--, data++)\r |
22 | {\r |
23 | UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r |
24 | const __m128i *w = p + 3;\r |
25 | m = _mm_xor_si128(m, *data);\r |
26 | m = _mm_xor_si128(m, p[2]);\r |
27 | do\r |
28 | {\r |
29 | m = _mm_aesenc_si128(m, w[0]);\r |
30 | m = _mm_aesenc_si128(m, w[1]);\r |
31 | w += 2;\r |
32 | }\r |
33 | while (--numRounds2 != 0);\r |
34 | m = _mm_aesenc_si128(m, w[0]);\r |
35 | m = _mm_aesenclast_si128(m, w[1]);\r |
36 | *data = m;\r |
37 | }\r |
38 | *p = m;\r |
39 | }\r |
40 | \r |
/* Number of blocks handled per iteration by the interleaved loops.
   AES_OP_W is written for exactly this many state variables. */
#define NUM_WAYS 3

/* Applies the two-operand round operation `op`, with round key w[n], to each
   of m0, m1 and m2. The names w, m0, m1 and m2 must be in scope where the
   macro is expanded. The expansion is a complete brace block, so call sites
   use it without a trailing semicolon. */
#define AES_OP_W(op, n) { \
    const __m128i key = w[n]; \
    m0 = op(m0, key); \
    m1 = op(m1, key); \
    m2 = op(m2, key); \
  }

/* Shorthands binding AES_OP_W to the four AES-NI round intrinsics. */
#define AES_DEC(n)      AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
#define AES_ENC(n)      AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
54 | \r |
55 | void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r |
56 | {\r |
57 | __m128i iv = *p;\r |
58 | for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)\r |
59 | {\r |
60 | UInt32 numRounds2 = *(const UInt32 *)(p + 1);\r |
61 | const __m128i *w = p + numRounds2 * 2;\r |
62 | __m128i m0, m1, m2;\r |
63 | {\r |
64 | const __m128i t = w[2];\r |
65 | m0 = _mm_xor_si128(t, data[0]);\r |
66 | m1 = _mm_xor_si128(t, data[1]);\r |
67 | m2 = _mm_xor_si128(t, data[2]);\r |
68 | }\r |
69 | numRounds2--;\r |
70 | do\r |
71 | {\r |
72 | AES_DEC(1)\r |
73 | AES_DEC(0)\r |
74 | w -= 2;\r |
75 | }\r |
76 | while (--numRounds2 != 0);\r |
77 | AES_DEC(1)\r |
78 | AES_DEC_LAST(0)\r |
79 | \r |
80 | {\r |
81 | __m128i t;\r |
82 | t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;\r |
83 | t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;\r |
84 | t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;\r |
85 | }\r |
86 | }\r |
87 | for (; numBlocks != 0; numBlocks--, data++)\r |
88 | {\r |
89 | UInt32 numRounds2 = *(const UInt32 *)(p + 1);\r |
90 | const __m128i *w = p + numRounds2 * 2;\r |
91 | __m128i m = _mm_xor_si128(w[2], *data);\r |
92 | numRounds2--;\r |
93 | do\r |
94 | {\r |
95 | m = _mm_aesdec_si128(m, w[1]);\r |
96 | m = _mm_aesdec_si128(m, w[0]);\r |
97 | w -= 2;\r |
98 | }\r |
99 | while (--numRounds2 != 0);\r |
100 | m = _mm_aesdec_si128(m, w[1]);\r |
101 | m = _mm_aesdeclast_si128(m, w[0]);\r |
102 | \r |
103 | m = _mm_xor_si128(m, iv);\r |
104 | iv = *data;\r |
105 | *data = m;\r |
106 | }\r |
107 | *p = iv;\r |
108 | }\r |
109 | \r |
110 | void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r |
111 | {\r |
112 | __m128i ctr = *p;\r |
113 | __m128i one;\r |
114 | one.m128i_u64[0] = 1;\r |
115 | one.m128i_u64[1] = 0;\r |
116 | for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)\r |
117 | {\r |
118 | UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r |
119 | const __m128i *w = p;\r |
120 | __m128i m0, m1, m2;\r |
121 | {\r |
122 | const __m128i t = w[2];\r |
123 | ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);\r |
124 | ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);\r |
125 | ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);\r |
126 | }\r |
127 | w += 3;\r |
128 | do\r |
129 | {\r |
130 | AES_ENC(0)\r |
131 | AES_ENC(1)\r |
132 | w += 2;\r |
133 | }\r |
134 | while (--numRounds2 != 0);\r |
135 | AES_ENC(0)\r |
136 | AES_ENC_LAST(1)\r |
137 | data[0] = _mm_xor_si128(data[0], m0);\r |
138 | data[1] = _mm_xor_si128(data[1], m1);\r |
139 | data[2] = _mm_xor_si128(data[2], m2);\r |
140 | }\r |
141 | for (; numBlocks != 0; numBlocks--, data++)\r |
142 | {\r |
143 | UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r |
144 | const __m128i *w = p;\r |
145 | __m128i m;\r |
146 | ctr = _mm_add_epi64(ctr, one);\r |
147 | m = _mm_xor_si128(ctr, p[2]);\r |
148 | w += 3;\r |
149 | do\r |
150 | {\r |
151 | m = _mm_aesenc_si128(m, w[0]);\r |
152 | m = _mm_aesenc_si128(m, w[1]);\r |
153 | w += 2;\r |
154 | }\r |
155 | while (--numRounds2 != 0);\r |
156 | m = _mm_aesenc_si128(m, w[0]);\r |
157 | m = _mm_aesenclast_si128(m, w[1]);\r |
158 | *data = _mm_xor_si128(*data, m);\r |
159 | }\r |
160 | *p = ctr;\r |
161 | }\r |
162 | \r |
163 | #else\r |
164 | \r |
165 | void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);\r |
166 | void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);\r |
167 | void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);\r |
168 | \r |
169 | void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r |
170 | {\r |
171 | AesCbc_Encode(p, data, numBlocks);\r |
172 | }\r |
173 | \r |
174 | void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r |
175 | {\r |
176 | AesCbc_Decode(p, data, numBlocks);\r |
177 | }\r |
178 | \r |
179 | void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r |
180 | {\r |
181 | AesCtr_Code(p, data, numBlocks);\r |
182 | }\r |
183 | \r |
184 | #endif\r |