/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

#if defined FLAC__CPU_IA32 /* unused for x64 */

#define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))
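
/* Both macros compute residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization)
 * for a 64-bit sum held in the low quadword of xmmN. SSE4.1 has no 64-bit
 * arithmetic shift, but for shift counts <= 32 the low 32 bits of a logical
 * and an arithmetic 64-bit right shift are identical (no fill bits reach
 * them), so _mm_srl_epi64()/_mm_srli_epi64() followed by _mm_cvtsi128_si32()
 * is correct even for negative sums; hence the lp_quantization <= 32
 * assertions below. */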

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
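
					/* With the coefficients in 32-bit lanes 0 and 2 (above) and each pair
					 * of history samples shuffled into the same lanes in reverse order
					 * (below), one _mm_mul_epi32() yields the two signed 32x32->64
					 * products q[k]*d[i-k-1] and q[k+1]*d[i-k-2]; the 64-bit partial sums
					 * are accumulated and the two halves folded together before the
					 * shift. The same pairing is used for every order below. */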

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0  0  d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]  0  d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
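			/* intentional case fall-through: entering the switch at `order`
			 * accumulates exactly `order` coefficient taps */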
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

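/* Decoder counterpart: data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization).
 * Unlike the residual routine above, each output sample feeds back into the
 * next prediction, so instead of reloading history from memory the loop keeps
 * the last `order` samples (in pairs) in xmm registers and slides them along
 * with _mm_alignr_epi8() every iteration; the first sample is computed before
 * the loop to prime those registers, which is why data_len == 0 is rejected
 * up front. */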
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	if (!data_len)
		return;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				__m128i qlp[6], dat[6];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0  0  q[1]  q[0]
				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0  0  q[3]  q[2]
				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0  0  q[5]  q[4]
				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0  0  q[7]  q[6]
				qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0  0  q[9]  q[8]
				if (order == 12)
					qlp[5] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
				else
					qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]); // 0  0  0  q[10]

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); // 0  q[0]  0  q[1]
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); // 0  q[2]  0  q[3]
				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); // 0  q[4]  0  q[5]
				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1)); // 0  q[6]  0  q[7]
				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1)); // 0  q[8]  0  q[9]
				qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1)); // 0  q[10] 0  q[11]

				dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12))); // ?  d[i-11]  ?  d[i-12]
				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10))); // ?  d[i-9]   ?  d[i-10]
				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 ))); // ?  d[i-7]   ?  d[i-8]
				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); // ?  d[i-5]   ?  d[i-6]
				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); // ?  d[i-3]   ?  d[i-4]
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); // ?  d[i-1]   ?  d[i-2]

				summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64  sum_64
				summ = _mm_srl_epi64(summ, cnt);                     // ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
				temp = _mm_cvtsi32_si128(residual[0]); // 0  0  0  r[i]
				temp = _mm_add_epi32(temp, summ);      // ?  ?  ?  d[i]
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[5] = _mm_alignr_epi8(dat[4], dat[5], 8); // ?  d[i-11]  ?  d[i-12]
					dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8); // ?  d[i-9]   ?  d[i-10]
					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8); // ?  d[i-7]   ?  d[i-8]
					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8); // ?  d[i-5]   ?  d[i-6]
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); // ?  d[i-3]   ?  d[i-4]
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8); // ?  d[i-1]   ?  d[i-2]

					summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64  sum_64
					summ = _mm_srl_epi64(summ, cnt);                     // ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
					temp = _mm_cvtsi32_si128(residual[i]); // 0  0  0  r[i]
					temp = _mm_add_epi32(temp, summ);      // ?  ?  ?  d[i]
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 9, 10 */
				__m128i qlp[5], dat[5];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
				if (order == 10)
					qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
				else
					qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8]);

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));

				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));
				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));

				summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);
					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				__m128i qlp[4], dat[4];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
				if (order == 8)
					qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
				else
					qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6]);

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));

				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));

				summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 5, 6 */
				__m128i qlp[3], dat[3];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
				if (order == 6)
					qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
				else
					qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4]);

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));

				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));

				summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				__m128i qlp[2], dat[2];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
				if (order == 4)
					qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
				else
					qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2]);

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));

				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));

				summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i qlp0, dat0;
					__m128i summ, temp;
					qlp0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff));
					qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFFLE(2,0,3,1));

					dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));

					summ = _mm_mul_epi32(dat0, qlp0) ;

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[0]);
					temp = _mm_add_epi32(temp, summ);
					data[0] = _mm_cvtsi128_si32(temp);

					for(i = 1; i < (int)data_len; i++) {
						dat0 = _mm_alignr_epi8(temp, dat0, 8);

						summ = _mm_mul_epi32(dat0, qlp0) ;

						summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
						summ = _mm_srl_epi64(summ, cnt);
						temp = _mm_cvtsi32_si128(residual[i]);
						temp = _mm_add_epi32(temp, summ);
						data[i] = _mm_cvtsi128_si32(temp);
					}
				}
				else { /* order == 1 */
					__m128i qlp0;
					__m128i summ, temp;
					qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
					temp = _mm_cvtsi32_si128(data[-1]);

					summ = _mm_mul_epi32(temp, qlp0);
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[0]);
					temp = _mm_add_epi32(temp, summ);
					data[0] = _mm_cvtsi128_si32(temp);

					for(i = 1; i < (int)data_len; i++) {
						summ = _mm_mul_epi32(temp, qlp0) ;
						summ = _mm_srl_epi64(summ, cnt);
						temp = _mm_cvtsi32_si128(residual[i]);
						temp = _mm_add_epi32(temp, summ);
						data[i] = _mm_cvtsi128_si32(temp);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
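			/* intentional case fall-through, as in the residual routine above */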
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

#endif /* defined FLAC__CPU_IA32 */

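/* 32-bit variant: computes four residuals per iteration with 4-wide SIMD.
 * All products and the accumulator stay in 32 bits (_mm_mullo_epi32), so this
 * path is only valid when the caller knows the predictor sum cannot overflow
 * a signed 32-bit integer; the *_wide routines above cover the cases that
 * need 64-bit accumulation. */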
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
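
					/* Each coefficient is broadcast to all four lanes above, so every
					 * iteration computes residual[i..i+3] at once from unaligned loads of
					 * the sliding data window. The loop stops at data_len-3; any leftover
					 * 1-3 samples are finished off by scalar code (note the `sum`
					 * variable declared at the top of the function for that purpose). */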
984 | |
985 | for(i = 0; i < (int)data_len-3; i+=4) { |
986 | __m128i summ, mull; |
987 | summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12))); |
988 | mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull); |
989 | mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); |
990 | mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
991 | mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
992 | mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
993 | mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
994 | mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
995 | mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
996 | mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
997 | mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
998 | mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
999 | summ = _mm_sra_epi32(summ, cnt); |
1000 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
1001 | } |
1002 | } |
1003 | else { /* order == 11 */ |
1004 | __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; |
1005 | q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
1006 | q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
1007 | q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
1008 | q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
1009 | q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
1010 | q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
1011 | q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
1012 | q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
1013 | q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
1014 | q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
1015 | q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); |
1016 | |
1017 | for(i = 0; i < (int)data_len-3; i+=4) { |
1018 | __m128i summ, mull; |
1019 | summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); |
1020 | mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); |
1021 | mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
1022 | mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
1023 | mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
1024 | mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
1025 | mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
1026 | mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
1027 | mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
1028 | mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
1029 | mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
1030 | summ = _mm_sra_epi32(summ, cnt); |
1031 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
1032 | } |
1033 | } |
1034 | } |
1035 | else { |
1036 | if(order == 10) { |
1037 | __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; |
1038 | q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
1039 | q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
1040 | q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
1041 | q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
1042 | q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
1043 | q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
1044 | q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
1045 | q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
1046 | q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
1047 | q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
1048 | |
1049 | for(i = 0; i < (int)data_len-3; i+=4) { |
1050 | __m128i summ, mull; |
1051 | summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); |
1052 | mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
1053 | mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
1054 | mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
1055 | mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
1056 | mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
1057 | mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
1058 | mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
1059 | mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
1060 | mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
1061 | summ = _mm_sra_epi32(summ, cnt); |
1062 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
1063 | } |
1064 | } |
1065 | else { /* order == 9 */ |
1066 | __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8; |
1067 | q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
1068 | q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
1069 | q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
1070 | q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
1071 | q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
1072 | q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
1073 | q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
1074 | q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
1075 | q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
1076 | |
1077 | for(i = 0; i < (int)data_len-3; i+=4) { |
1078 | __m128i summ, mull; |
1079 | summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); |
1080 | mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
1081 | mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
1082 | mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
1083 | mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
1084 | mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
1085 | mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
1086 | mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
1087 | mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
1088 | summ = _mm_sra_epi32(summ, cnt); |
1089 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
1090 | } |
1091 | } |
1092 | } |
1093 | } |
1094 | else if(order > 4) { |
1095 | if(order > 6) { |
1096 | if(order == 8) { |
1097 | __m128i q0, q1, q2, q3, q4, q5, q6, q7; |
1098 | q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
1099 | q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
1100 | q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
1101 | q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
1102 | q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
1103 | q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
1104 | q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
1105 | q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
1106 | |
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
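		/* Pick up the remaining (data_len % 4) samples in scalar code.
		 * The switch cases intentionally fall through so that exactly
		 * `order` coefficient taps are accumulated per sample. */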
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9: sum += qlp_coeff[ 8] * data[i- 9];
				case 8: sum += qlp_coeff[ 7] * data[i- 8];
				case 7: sum += qlp_coeff[ 6] * data[i- 7];
				case 6: sum += qlp_coeff[ 5] * data[i- 6];
				case 5: sum += qlp_coeff[ 4] * data[i- 5];
				case 4: sum += qlp_coeff[ 3] * data[i- 4];
				case 3: sum += qlp_coeff[ 2] * data[i- 3];
				case 2: sum += qlp_coeff[ 1] * data[i- 2];
				case 1: sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
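		/* No SSE4.1 unrolling for orders 13..32: every residual is
		 * computed in scalar code, again with a fall-through switch. */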
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */