/* Imported from pcsx_rearmed (merge #511): deps/lzma-16.04/C/AesOpt.c */
/* AesOpt.c -- Intel's AES
2013-11-12 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64
#if _MSC_VER >= 1500
#define USE_INTEL_AES
#endif
#endif

#ifdef USE_INTEL_AES

#include <wmmintrin.h>

18void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r
19{\r
20 __m128i m = *p;\r
21 for (; numBlocks != 0; numBlocks--, data++)\r
22 {\r
23 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r
24 const __m128i *w = p + 3;\r
25 m = _mm_xor_si128(m, *data);\r
26 m = _mm_xor_si128(m, p[2]);\r
27 do\r
28 {\r
29 m = _mm_aesenc_si128(m, w[0]);\r
30 m = _mm_aesenc_si128(m, w[1]);\r
31 w += 2;\r
32 }\r
33 while (--numRounds2 != 0);\r
34 m = _mm_aesenc_si128(m, w[0]);\r
35 m = _mm_aesenclast_si128(m, w[1]);\r
36 *data = m;\r
37 }\r
38 *p = m;\r
39}\r

#define NUM_WAYS 3

#define AES_OP_W(op, n) { \
    const __m128i t = w[n]; \
    m0 = op(m0, t); \
    m1 = op(m1, t); \
    m2 = op(m2, t); \
    }

#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)

55void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r
56{\r
57 __m128i iv = *p;\r
58 for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)\r
59 {\r
60 UInt32 numRounds2 = *(const UInt32 *)(p + 1);\r
61 const __m128i *w = p + numRounds2 * 2;\r
62 __m128i m0, m1, m2;\r
63 {\r
64 const __m128i t = w[2];\r
65 m0 = _mm_xor_si128(t, data[0]);\r
66 m1 = _mm_xor_si128(t, data[1]);\r
67 m2 = _mm_xor_si128(t, data[2]);\r
68 }\r
69 numRounds2--;\r
70 do\r
71 {\r
72 AES_DEC(1)\r
73 AES_DEC(0)\r
74 w -= 2;\r
75 }\r
76 while (--numRounds2 != 0);\r
77 AES_DEC(1)\r
78 AES_DEC_LAST(0)\r
79\r
80 {\r
81 __m128i t;\r
82 t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;\r
83 t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;\r
84 t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;\r
85 }\r
86 }\r
87 for (; numBlocks != 0; numBlocks--, data++)\r
88 {\r
89 UInt32 numRounds2 = *(const UInt32 *)(p + 1);\r
90 const __m128i *w = p + numRounds2 * 2;\r
91 __m128i m = _mm_xor_si128(w[2], *data);\r
92 numRounds2--;\r
93 do\r
94 {\r
95 m = _mm_aesdec_si128(m, w[1]);\r
96 m = _mm_aesdec_si128(m, w[0]);\r
97 w -= 2;\r
98 }\r
99 while (--numRounds2 != 0);\r
100 m = _mm_aesdec_si128(m, w[1]);\r
101 m = _mm_aesdeclast_si128(m, w[0]);\r
102\r
103 m = _mm_xor_si128(m, iv);\r
104 iv = *data;\r
105 *data = m;\r
106 }\r
107 *p = iv;\r
108}\r
109\r
110void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r
111{\r
112 __m128i ctr = *p;\r
113 __m128i one;\r
114 one.m128i_u64[0] = 1;\r
115 one.m128i_u64[1] = 0;\r
116 for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)\r
117 {\r
118 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r
119 const __m128i *w = p;\r
120 __m128i m0, m1, m2;\r
121 {\r
122 const __m128i t = w[2];\r
123 ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);\r
124 ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);\r
125 ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);\r
126 }\r
127 w += 3;\r
128 do\r
129 {\r
130 AES_ENC(0)\r
131 AES_ENC(1)\r
132 w += 2;\r
133 }\r
134 while (--numRounds2 != 0);\r
135 AES_ENC(0)\r
136 AES_ENC_LAST(1)\r
137 data[0] = _mm_xor_si128(data[0], m0);\r
138 data[1] = _mm_xor_si128(data[1], m1);\r
139 data[2] = _mm_xor_si128(data[2], m2);\r
140 }\r
141 for (; numBlocks != 0; numBlocks--, data++)\r
142 {\r
143 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r
144 const __m128i *w = p;\r
145 __m128i m;\r
146 ctr = _mm_add_epi64(ctr, one);\r
147 m = _mm_xor_si128(ctr, p[2]);\r
148 w += 3;\r
149 do\r
150 {\r
151 m = _mm_aesenc_si128(m, w[0]);\r
152 m = _mm_aesenc_si128(m, w[1]);\r
153 w += 2;\r
154 }\r
155 while (--numRounds2 != 0);\r
156 m = _mm_aesenc_si128(m, w[0]);\r
157 m = _mm_aesenclast_si128(m, w[1]);\r
158 *data = _mm_xor_si128(*data, m);\r
159 }\r
160 *p = ctr;\r
161}\r

#else

void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);

169void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r
170{\r
171 AesCbc_Encode(p, data, numBlocks);\r
172}\r
173\r
174void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r
175{\r
176 AesCbc_Decode(p, data, numBlocks);\r
177}\r
178\r
179void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r
180{\r
181 AesCtr_Code(p, data, numBlocks);\r
182}\r

#endif