| 1 | /* AesOpt.c -- Intel's AES\r |
| 2 | 2013-11-12 : Igor Pavlov : Public domain */\r |
| 3 | \r |
| 4 | #include "Precomp.h"\r |
| 5 | \r |
| 6 | #include "CpuArch.h"\r |
| 7 | \r |
/* Select the AES-NI implementation only when CpuArch.h reports an x86/x64
   target (MY_CPU_X86_OR_AMD64) and the compiler reports _MSC_VER >= 1500.
   An undefined _MSC_VER evaluates to 0 in #if, so other compilers fall
   through to the forwarding wrappers at the end of the file. */
#if defined(MY_CPU_X86_OR_AMD64) && (_MSC_VER >= 1500)
#define USE_INTEL_AES
#endif
| 13 | \r |
| 14 | #ifdef USE_INTEL_AES\r |
| 15 | \r |
| 16 | #include <wmmintrin.h>\r |
| 17 | \r |
| 18 | void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r |
| 19 | {\r |
| 20 | __m128i m = *p;\r |
| 21 | for (; numBlocks != 0; numBlocks--, data++)\r |
| 22 | {\r |
| 23 | UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r |
| 24 | const __m128i *w = p + 3;\r |
| 25 | m = _mm_xor_si128(m, *data);\r |
| 26 | m = _mm_xor_si128(m, p[2]);\r |
| 27 | do\r |
| 28 | {\r |
| 29 | m = _mm_aesenc_si128(m, w[0]);\r |
| 30 | m = _mm_aesenc_si128(m, w[1]);\r |
| 31 | w += 2;\r |
| 32 | }\r |
| 33 | while (--numRounds2 != 0);\r |
| 34 | m = _mm_aesenc_si128(m, w[0]);\r |
| 35 | m = _mm_aesenclast_si128(m, w[1]);\r |
| 36 | *data = m;\r |
| 37 | }\r |
| 38 | *p = m;\r |
| 39 | }\r |
| 40 | \r |
/* Number of blocks the interleaved loops below process per iteration.
   AES_OP_W is written for exactly this many states (m0, m1, m2). */
#define NUM_WAYS 3

/* Apply the two-operand round operation `op` with round key w[n] to each of
   the three states m0, m1 and m2.
   `w`, `m0`, `m1` and `m2` must be visible at the point of use.
   The expansion is a braced block, so the macro is used as a statement
   without a trailing semicolon. */
#define AES_OP_W(op, n) { \
    const __m128i roundKey = w[n]; \
    m0 = op(m0, roundKey); \
    m1 = op(m1, roundKey); \
    m2 = op(m2, roundKey); \
  }

/* Three-way wrappers around the individual AES-NI round intrinsics. */
#define AES_ENC(n)      AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
#define AES_DEC(n)      AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
| 54 | \r |
| 55 | void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r |
| 56 | {\r |
| 57 | __m128i iv = *p;\r |
| 58 | for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)\r |
| 59 | {\r |
| 60 | UInt32 numRounds2 = *(const UInt32 *)(p + 1);\r |
| 61 | const __m128i *w = p + numRounds2 * 2;\r |
| 62 | __m128i m0, m1, m2;\r |
| 63 | {\r |
| 64 | const __m128i t = w[2];\r |
| 65 | m0 = _mm_xor_si128(t, data[0]);\r |
| 66 | m1 = _mm_xor_si128(t, data[1]);\r |
| 67 | m2 = _mm_xor_si128(t, data[2]);\r |
| 68 | }\r |
| 69 | numRounds2--;\r |
| 70 | do\r |
| 71 | {\r |
| 72 | AES_DEC(1)\r |
| 73 | AES_DEC(0)\r |
| 74 | w -= 2;\r |
| 75 | }\r |
| 76 | while (--numRounds2 != 0);\r |
| 77 | AES_DEC(1)\r |
| 78 | AES_DEC_LAST(0)\r |
| 79 | \r |
| 80 | {\r |
| 81 | __m128i t;\r |
| 82 | t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;\r |
| 83 | t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;\r |
| 84 | t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;\r |
| 85 | }\r |
| 86 | }\r |
| 87 | for (; numBlocks != 0; numBlocks--, data++)\r |
| 88 | {\r |
| 89 | UInt32 numRounds2 = *(const UInt32 *)(p + 1);\r |
| 90 | const __m128i *w = p + numRounds2 * 2;\r |
| 91 | __m128i m = _mm_xor_si128(w[2], *data);\r |
| 92 | numRounds2--;\r |
| 93 | do\r |
| 94 | {\r |
| 95 | m = _mm_aesdec_si128(m, w[1]);\r |
| 96 | m = _mm_aesdec_si128(m, w[0]);\r |
| 97 | w -= 2;\r |
| 98 | }\r |
| 99 | while (--numRounds2 != 0);\r |
| 100 | m = _mm_aesdec_si128(m, w[1]);\r |
| 101 | m = _mm_aesdeclast_si128(m, w[0]);\r |
| 102 | \r |
| 103 | m = _mm_xor_si128(m, iv);\r |
| 104 | iv = *data;\r |
| 105 | *data = m;\r |
| 106 | }\r |
| 107 | *p = iv;\r |
| 108 | }\r |
| 109 | \r |
| 110 | void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)\r |
| 111 | {\r |
| 112 | __m128i ctr = *p;\r |
| 113 | __m128i one;\r |
| 114 | one.m128i_u64[0] = 1;\r |
| 115 | one.m128i_u64[1] = 0;\r |
| 116 | for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)\r |
| 117 | {\r |
| 118 | UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r |
| 119 | const __m128i *w = p;\r |
| 120 | __m128i m0, m1, m2;\r |
| 121 | {\r |
| 122 | const __m128i t = w[2];\r |
| 123 | ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);\r |
| 124 | ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);\r |
| 125 | ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);\r |
| 126 | }\r |
| 127 | w += 3;\r |
| 128 | do\r |
| 129 | {\r |
| 130 | AES_ENC(0)\r |
| 131 | AES_ENC(1)\r |
| 132 | w += 2;\r |
| 133 | }\r |
| 134 | while (--numRounds2 != 0);\r |
| 135 | AES_ENC(0)\r |
| 136 | AES_ENC_LAST(1)\r |
| 137 | data[0] = _mm_xor_si128(data[0], m0);\r |
| 138 | data[1] = _mm_xor_si128(data[1], m1);\r |
| 139 | data[2] = _mm_xor_si128(data[2], m2);\r |
| 140 | }\r |
| 141 | for (; numBlocks != 0; numBlocks--, data++)\r |
| 142 | {\r |
| 143 | UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;\r |
| 144 | const __m128i *w = p;\r |
| 145 | __m128i m;\r |
| 146 | ctr = _mm_add_epi64(ctr, one);\r |
| 147 | m = _mm_xor_si128(ctr, p[2]);\r |
| 148 | w += 3;\r |
| 149 | do\r |
| 150 | {\r |
| 151 | m = _mm_aesenc_si128(m, w[0]);\r |
| 152 | m = _mm_aesenc_si128(m, w[1]);\r |
| 153 | w += 2;\r |
| 154 | }\r |
| 155 | while (--numRounds2 != 0);\r |
| 156 | m = _mm_aesenc_si128(m, w[0]);\r |
| 157 | m = _mm_aesenclast_si128(m, w[1]);\r |
| 158 | *data = _mm_xor_si128(*data, m);\r |
| 159 | }\r |
| 160 | *p = ctr;\r |
| 161 | }\r |
| 162 | \r |
| 163 | #else\r |
| 164 | \r |
| 165 | void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);\r |
| 166 | void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);\r |
| 167 | void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);\r |
| 168 | \r |
| 169 | void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r |
| 170 | {\r |
| 171 | AesCbc_Encode(p, data, numBlocks);\r |
| 172 | }\r |
| 173 | \r |
| 174 | void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r |
| 175 | {\r |
| 176 | AesCbc_Decode(p, data, numBlocks);\r |
| 177 | }\r |
| 178 | \r |
| 179 | void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)\r |
| 180 | {\r |
| 181 | AesCtr_Code(p, data, numBlocks);\r |
| 182 | }\r |
| 183 | \r |
| 184 | #endif\r |