ce188d4d |
1 | /* libFLAC - Free Lossless Audio Codec library |
2 | * Copyright (C) 2000-2009 Josh Coalson |
3 | * Copyright (C) 2011-2016 Xiph.Org Foundation |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * |
9 | * - Redistributions of source code must retain the above copyright |
10 | * notice, this list of conditions and the following disclaimer. |
11 | * |
12 | * - Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * - Neither the name of the Xiph.org Foundation nor the names of its |
17 | * contributors may be used to endorse or promote products derived from |
18 | * this software without specific prior written permission. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
23 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
24 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
25 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
26 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
27 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
28 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
29 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
30 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
31 | */ |
32 | |
33 | #ifdef HAVE_CONFIG_H |
34 | # include <config.h> |
35 | #endif |
36 | |
37 | #include "private/cpu.h" |
38 | |
39 | #ifndef FLAC__INTEGER_ONLY_LIBRARY |
40 | #ifndef FLAC__NO_ASM |
41 | #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN |
42 | #include "private/lpc.h" |
43 | #ifdef FLAC__AVX2_SUPPORTED |
44 | |
45 | #include "FLAC/assert.h" |
46 | #include "FLAC/format.h" |
47 | |
48 | #include <immintrin.h> /* AVX2 */ |
49 | |
50 | FLAC__SSE_TARGET("avx2") |
51 | void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
52 | { |
53 | int i; |
54 | FLAC__int32 sum; |
55 | __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
56 | |
57 | FLAC__ASSERT(order > 0); |
58 | FLAC__ASSERT(order <= 32); |
59 | |
60 | if(order <= 12) { |
61 | if(order > 8) { |
62 | if(order > 10) { |
63 | if(order == 12) { |
64 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; |
65 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
66 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
67 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
68 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
69 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
70 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
71 | q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); |
72 | q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); |
73 | q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); |
74 | q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); |
75 | q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]); |
76 | q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]); |
77 | |
78 | for(i = 0; i < (int)data_len-7; i+=8) { |
79 | __m256i summ, mull; |
80 | summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12))); |
81 | mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull); |
82 | mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); |
83 | mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); |
84 | mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); |
85 | mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); |
86 | mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); |
87 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
88 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
89 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
90 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
91 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
92 | summ = _mm256_sra_epi32(summ, cnt); |
93 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
94 | } |
95 | } |
96 | else { /* order == 11 */ |
97 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; |
98 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
99 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
100 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
101 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
102 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
103 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
104 | q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); |
105 | q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); |
106 | q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); |
107 | q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); |
108 | q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]); |
109 | |
110 | for(i = 0; i < (int)data_len-7; i+=8) { |
111 | __m256i summ, mull; |
112 | summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); |
113 | mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); |
114 | mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); |
115 | mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); |
116 | mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); |
117 | mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); |
118 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
119 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
120 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
121 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
122 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
123 | summ = _mm256_sra_epi32(summ, cnt); |
124 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
125 | } |
126 | } |
127 | } |
128 | else { |
129 | if(order == 10) { |
130 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; |
131 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
132 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
133 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
134 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
135 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
136 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
137 | q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); |
138 | q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); |
139 | q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); |
140 | q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); |
141 | |
142 | for(i = 0; i < (int)data_len-7; i+=8) { |
143 | __m256i summ, mull; |
144 | summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); |
145 | mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); |
146 | mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); |
147 | mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); |
148 | mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); |
149 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
150 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
151 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
152 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
153 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
154 | summ = _mm256_sra_epi32(summ, cnt); |
155 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
156 | } |
157 | } |
158 | else { /* order == 9 */ |
159 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; |
160 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
161 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
162 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
163 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
164 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
165 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
166 | q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); |
167 | q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); |
168 | q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); |
169 | |
170 | for(i = 0; i < (int)data_len-7; i+=8) { |
171 | __m256i summ, mull; |
172 | summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); |
173 | mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); |
174 | mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); |
175 | mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); |
176 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
177 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
178 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
179 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
180 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
181 | summ = _mm256_sra_epi32(summ, cnt); |
182 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
183 | } |
184 | } |
185 | } |
186 | } |
187 | else if(order > 4) { |
188 | if(order > 6) { |
189 | if(order == 8) { |
190 | __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
191 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
192 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
193 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
194 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
195 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
196 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
197 | q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); |
198 | q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); |
199 | |
200 | for(i = 0; i < (int)data_len-7; i+=8) { |
201 | __m256i summ, mull; |
202 | summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); |
203 | mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); |
204 | mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); |
205 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
206 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
207 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
208 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
209 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
210 | summ = _mm256_sra_epi32(summ, cnt); |
211 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
212 | } |
213 | } |
214 | else { /* order == 7 */ |
215 | __m256i q0, q1, q2, q3, q4, q5, q6; |
216 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
217 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
218 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
219 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
220 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
221 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
222 | q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); |
223 | |
224 | for(i = 0; i < (int)data_len-7; i+=8) { |
225 | __m256i summ, mull; |
226 | summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); |
227 | mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); |
228 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
229 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
230 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
231 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
232 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
233 | summ = _mm256_sra_epi32(summ, cnt); |
234 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
235 | } |
236 | } |
237 | } |
238 | else { |
239 | if(order == 6) { |
240 | __m256i q0, q1, q2, q3, q4, q5; |
241 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
242 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
243 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
244 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
245 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
246 | q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); |
247 | |
248 | for(i = 0; i < (int)data_len-7; i+=8) { |
249 | __m256i summ, mull; |
250 | summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); |
251 | mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); |
252 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
253 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
254 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
255 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
256 | summ = _mm256_sra_epi32(summ, cnt); |
257 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
258 | } |
259 | } |
260 | else { /* order == 5 */ |
261 | __m256i q0, q1, q2, q3, q4; |
262 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
263 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
264 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
265 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
266 | q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); |
267 | |
268 | for(i = 0; i < (int)data_len-7; i+=8) { |
269 | __m256i summ, mull; |
270 | summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); |
271 | mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); |
272 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
273 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
274 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
275 | summ = _mm256_sra_epi32(summ, cnt); |
276 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
277 | } |
278 | } |
279 | } |
280 | } |
281 | else { |
282 | if(order > 2) { |
283 | if(order == 4) { |
284 | __m256i q0, q1, q2, q3; |
285 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
286 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
287 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
288 | q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); |
289 | |
290 | for(i = 0; i < (int)data_len-7; i+=8) { |
291 | __m256i summ, mull; |
292 | summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); |
293 | mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); |
294 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
295 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
296 | summ = _mm256_sra_epi32(summ, cnt); |
297 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
298 | } |
299 | } |
300 | else { /* order == 3 */ |
301 | __m256i q0, q1, q2; |
302 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
303 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
304 | q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); |
305 | |
306 | for(i = 0; i < (int)data_len-7; i+=8) { |
307 | __m256i summ, mull; |
308 | summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); |
309 | mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); |
310 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
311 | summ = _mm256_sra_epi32(summ, cnt); |
312 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
313 | } |
314 | } |
315 | } |
316 | else { |
317 | if(order == 2) { |
318 | __m256i q0, q1; |
319 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
320 | q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); |
321 | |
322 | for(i = 0; i < (int)data_len-7; i+=8) { |
323 | __m256i summ, mull; |
324 | summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); |
325 | mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); |
326 | summ = _mm256_sra_epi32(summ, cnt); |
327 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
328 | } |
329 | } |
330 | else { /* order == 1 */ |
331 | __m256i q0; |
332 | q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); |
333 | |
334 | for(i = 0; i < (int)data_len-7; i+=8) { |
335 | __m256i summ; |
336 | summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); |
337 | summ = _mm256_sra_epi32(summ, cnt); |
338 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
339 | } |
340 | } |
341 | } |
342 | } |
343 | for(; i < (int)data_len; i++) { |
344 | sum = 0; |
345 | switch(order) { |
346 | case 12: sum += qlp_coeff[11] * data[i-12]; |
347 | case 11: sum += qlp_coeff[10] * data[i-11]; |
348 | case 10: sum += qlp_coeff[ 9] * data[i-10]; |
349 | case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
350 | case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
351 | case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
352 | case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
353 | case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
354 | case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
355 | case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
356 | case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
357 | case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
358 | } |
359 | residual[i] = data[i] - (sum >> lp_quantization); |
360 | } |
361 | } |
362 | else { /* order > 12 */ |
363 | for(i = 0; i < (int)data_len; i++) { |
364 | sum = 0; |
365 | switch(order) { |
366 | case 32: sum += qlp_coeff[31] * data[i-32]; |
367 | case 31: sum += qlp_coeff[30] * data[i-31]; |
368 | case 30: sum += qlp_coeff[29] * data[i-30]; |
369 | case 29: sum += qlp_coeff[28] * data[i-29]; |
370 | case 28: sum += qlp_coeff[27] * data[i-28]; |
371 | case 27: sum += qlp_coeff[26] * data[i-27]; |
372 | case 26: sum += qlp_coeff[25] * data[i-26]; |
373 | case 25: sum += qlp_coeff[24] * data[i-25]; |
374 | case 24: sum += qlp_coeff[23] * data[i-24]; |
375 | case 23: sum += qlp_coeff[22] * data[i-23]; |
376 | case 22: sum += qlp_coeff[21] * data[i-22]; |
377 | case 21: sum += qlp_coeff[20] * data[i-21]; |
378 | case 20: sum += qlp_coeff[19] * data[i-20]; |
379 | case 19: sum += qlp_coeff[18] * data[i-19]; |
380 | case 18: sum += qlp_coeff[17] * data[i-18]; |
381 | case 17: sum += qlp_coeff[16] * data[i-17]; |
382 | case 16: sum += qlp_coeff[15] * data[i-16]; |
383 | case 15: sum += qlp_coeff[14] * data[i-15]; |
384 | case 14: sum += qlp_coeff[13] * data[i-14]; |
385 | case 13: sum += qlp_coeff[12] * data[i-13]; |
386 | sum += qlp_coeff[11] * data[i-12]; |
387 | sum += qlp_coeff[10] * data[i-11]; |
388 | sum += qlp_coeff[ 9] * data[i-10]; |
389 | sum += qlp_coeff[ 8] * data[i- 9]; |
390 | sum += qlp_coeff[ 7] * data[i- 8]; |
391 | sum += qlp_coeff[ 6] * data[i- 7]; |
392 | sum += qlp_coeff[ 5] * data[i- 6]; |
393 | sum += qlp_coeff[ 4] * data[i- 5]; |
394 | sum += qlp_coeff[ 3] * data[i- 4]; |
395 | sum += qlp_coeff[ 2] * data[i- 3]; |
396 | sum += qlp_coeff[ 1] * data[i- 2]; |
397 | sum += qlp_coeff[ 0] * data[i- 1]; |
398 | } |
399 | residual[i] = data[i] - (sum >> lp_quantization); |
400 | } |
401 | } |
402 | _mm256_zeroupper(); |
403 | } |
404 | |
405 | FLAC__SSE_TARGET("avx2") |
406 | void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
407 | { |
408 | int i; |
409 | FLAC__int32 sum; |
410 | __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
411 | |
412 | FLAC__ASSERT(order > 0); |
413 | FLAC__ASSERT(order <= 32); |
414 | |
415 | if(order <= 12) { |
416 | if(order > 8) { |
417 | if(order > 10) { |
418 | if(order == 12) { |
419 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; |
420 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
421 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
422 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
423 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
424 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
425 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
426 | q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
427 | q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
428 | q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
429 | q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
430 | q10 = _mm256_set1_epi32(qlp_coeff[10]); |
431 | q11 = _mm256_set1_epi32(qlp_coeff[11]); |
432 | |
433 | for(i = 0; i < (int)data_len-7; i+=8) { |
434 | __m256i summ, mull; |
435 | summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12))); |
436 | mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull); |
437 | mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); |
438 | mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); |
439 | mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); |
440 | mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); |
441 | mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); |
442 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
443 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
444 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
445 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
446 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
447 | summ = _mm256_sra_epi32(summ, cnt); |
448 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
449 | } |
450 | } |
451 | else { /* order == 11 */ |
452 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; |
453 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
454 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
455 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
456 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
457 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
458 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
459 | q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
460 | q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
461 | q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
462 | q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
463 | q10 = _mm256_set1_epi32(qlp_coeff[10]); |
464 | |
465 | for(i = 0; i < (int)data_len-7; i+=8) { |
466 | __m256i summ, mull; |
467 | summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); |
468 | mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); |
469 | mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); |
470 | mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); |
471 | mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); |
472 | mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); |
473 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
474 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
475 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
476 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
477 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
478 | summ = _mm256_sra_epi32(summ, cnt); |
479 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
480 | } |
481 | } |
482 | } |
483 | else { |
484 | if(order == 10) { |
485 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; |
486 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
487 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
488 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
489 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
490 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
491 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
492 | q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
493 | q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
494 | q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
495 | q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
496 | |
497 | for(i = 0; i < (int)data_len-7; i+=8) { |
498 | __m256i summ, mull; |
499 | summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); |
500 | mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); |
501 | mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); |
502 | mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); |
503 | mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); |
504 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
505 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
506 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
507 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
508 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
509 | summ = _mm256_sra_epi32(summ, cnt); |
510 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
511 | } |
512 | } |
513 | else { /* order == 9 */ |
514 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; |
515 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
516 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
517 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
518 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
519 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
520 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
521 | q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
522 | q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
523 | q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
524 | |
525 | for(i = 0; i < (int)data_len-7; i+=8) { |
526 | __m256i summ, mull; |
527 | summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); |
528 | mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); |
529 | mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); |
530 | mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); |
531 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
532 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
533 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
534 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
535 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
536 | summ = _mm256_sra_epi32(summ, cnt); |
537 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
538 | } |
539 | } |
540 | } |
541 | } |
542 | else if(order > 4) { |
543 | if(order > 6) { |
544 | if(order == 8) { |
545 | __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
546 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
547 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
548 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
549 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
550 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
551 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
552 | q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
553 | q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
554 | |
555 | for(i = 0; i < (int)data_len-7; i+=8) { |
556 | __m256i summ, mull; |
557 | summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); |
558 | mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); |
559 | mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); |
560 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
561 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
562 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
563 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
564 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
565 | summ = _mm256_sra_epi32(summ, cnt); |
566 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
567 | } |
568 | } |
569 | else { /* order == 7 */ |
570 | __m256i q0, q1, q2, q3, q4, q5, q6; |
571 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
572 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
573 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
574 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
575 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
576 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
577 | q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
578 | |
579 | for(i = 0; i < (int)data_len-7; i+=8) { |
580 | __m256i summ, mull; |
581 | summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); |
582 | mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); |
583 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
584 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
585 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
586 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
587 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
588 | summ = _mm256_sra_epi32(summ, cnt); |
589 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
590 | } |
591 | } |
592 | } |
593 | else { |
594 | if(order == 6) { |
595 | __m256i q0, q1, q2, q3, q4, q5; |
596 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
597 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
598 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
599 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
600 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
601 | q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
602 | |
603 | for(i = 0; i < (int)data_len-7; i+=8) { |
604 | __m256i summ, mull; |
605 | summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); |
606 | mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); |
607 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
608 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
609 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
610 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
611 | summ = _mm256_sra_epi32(summ, cnt); |
612 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
613 | } |
614 | } |
615 | else { /* order == 5 */ |
616 | __m256i q0, q1, q2, q3, q4; |
617 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
618 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
619 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
620 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
621 | q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
622 | |
623 | for(i = 0; i < (int)data_len-7; i+=8) { |
624 | __m256i summ, mull; |
625 | summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); |
626 | mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); |
627 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
628 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
629 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
630 | summ = _mm256_sra_epi32(summ, cnt); |
631 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
632 | } |
633 | } |
634 | } |
635 | } |
636 | else { |
637 | if(order > 2) { |
638 | if(order == 4) { |
639 | __m256i q0, q1, q2, q3; |
640 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
641 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
642 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
643 | q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
644 | |
645 | for(i = 0; i < (int)data_len-7; i+=8) { |
646 | __m256i summ, mull; |
647 | summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); |
648 | mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); |
649 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
650 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
651 | summ = _mm256_sra_epi32(summ, cnt); |
652 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
653 | } |
654 | } |
655 | else { /* order == 3 */ |
656 | __m256i q0, q1, q2; |
657 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
658 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
659 | q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
660 | |
661 | for(i = 0; i < (int)data_len-7; i+=8) { |
662 | __m256i summ, mull; |
663 | summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); |
664 | mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); |
665 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
666 | summ = _mm256_sra_epi32(summ, cnt); |
667 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
668 | } |
669 | } |
670 | } |
671 | else { |
672 | if(order == 2) { |
673 | __m256i q0, q1; |
674 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
675 | q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
676 | |
677 | for(i = 0; i < (int)data_len-7; i+=8) { |
678 | __m256i summ, mull; |
679 | summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); |
680 | mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); |
681 | summ = _mm256_sra_epi32(summ, cnt); |
682 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
683 | } |
684 | } |
685 | else { /* order == 1 */ |
686 | __m256i q0; |
687 | q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
688 | |
689 | for(i = 0; i < (int)data_len-7; i+=8) { |
690 | __m256i summ; |
691 | summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); |
692 | summ = _mm256_sra_epi32(summ, cnt); |
693 | _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); |
694 | } |
695 | } |
696 | } |
697 | } |
698 | for(; i < (int)data_len; i++) { |
699 | sum = 0; |
700 | switch(order) { |
701 | case 12: sum += qlp_coeff[11] * data[i-12]; |
702 | case 11: sum += qlp_coeff[10] * data[i-11]; |
703 | case 10: sum += qlp_coeff[ 9] * data[i-10]; |
704 | case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
705 | case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
706 | case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
707 | case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
708 | case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
709 | case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
710 | case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
711 | case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
712 | case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
713 | } |
714 | residual[i] = data[i] - (sum >> lp_quantization); |
715 | } |
716 | } |
717 | else { /* order > 12 */ |
718 | for(i = 0; i < (int)data_len; i++) { |
719 | sum = 0; |
720 | switch(order) { |
721 | case 32: sum += qlp_coeff[31] * data[i-32]; |
722 | case 31: sum += qlp_coeff[30] * data[i-31]; |
723 | case 30: sum += qlp_coeff[29] * data[i-30]; |
724 | case 29: sum += qlp_coeff[28] * data[i-29]; |
725 | case 28: sum += qlp_coeff[27] * data[i-28]; |
726 | case 27: sum += qlp_coeff[26] * data[i-27]; |
727 | case 26: sum += qlp_coeff[25] * data[i-26]; |
728 | case 25: sum += qlp_coeff[24] * data[i-25]; |
729 | case 24: sum += qlp_coeff[23] * data[i-24]; |
730 | case 23: sum += qlp_coeff[22] * data[i-23]; |
731 | case 22: sum += qlp_coeff[21] * data[i-22]; |
732 | case 21: sum += qlp_coeff[20] * data[i-21]; |
733 | case 20: sum += qlp_coeff[19] * data[i-20]; |
734 | case 19: sum += qlp_coeff[18] * data[i-19]; |
735 | case 18: sum += qlp_coeff[17] * data[i-18]; |
736 | case 17: sum += qlp_coeff[16] * data[i-17]; |
737 | case 16: sum += qlp_coeff[15] * data[i-16]; |
738 | case 15: sum += qlp_coeff[14] * data[i-15]; |
739 | case 14: sum += qlp_coeff[13] * data[i-14]; |
740 | case 13: sum += qlp_coeff[12] * data[i-13]; |
741 | sum += qlp_coeff[11] * data[i-12]; |
742 | sum += qlp_coeff[10] * data[i-11]; |
743 | sum += qlp_coeff[ 9] * data[i-10]; |
744 | sum += qlp_coeff[ 8] * data[i- 9]; |
745 | sum += qlp_coeff[ 7] * data[i- 8]; |
746 | sum += qlp_coeff[ 6] * data[i- 7]; |
747 | sum += qlp_coeff[ 5] * data[i- 6]; |
748 | sum += qlp_coeff[ 4] * data[i- 5]; |
749 | sum += qlp_coeff[ 3] * data[i- 4]; |
750 | sum += qlp_coeff[ 2] * data[i- 3]; |
751 | sum += qlp_coeff[ 1] * data[i- 2]; |
752 | sum += qlp_coeff[ 0] * data[i- 1]; |
753 | } |
754 | residual[i] = data[i] - (sum >> lp_quantization); |
755 | } |
756 | } |
757 | _mm256_zeroupper(); |
758 | } |
759 | |
760 | static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 }; |
761 | |
762 | FLAC__SSE_TARGET("avx2") |
763 | void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
764 | { |
765 | int i; |
766 | FLAC__int64 sum; |
767 | __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
768 | __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr); |
769 | |
770 | FLAC__ASSERT(order > 0); |
771 | FLAC__ASSERT(order <= 32); |
772 | FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */ |
773 | |
774 | if(order <= 12) { |
775 | if(order > 8) { |
776 | if(order > 10) { |
777 | if(order == 12) { |
778 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; |
779 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
780 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
781 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
782 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
783 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
784 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
785 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
786 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
787 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
788 | q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); |
789 | q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); |
790 | q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11])); |
791 | |
792 | for(i = 0; i < (int)data_len-3; i+=4) { |
793 | __m256i summ, mull; |
794 | summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12)))); |
795 | mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull); |
796 | mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); |
797 | mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); |
798 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
799 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
800 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
801 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
802 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
803 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
804 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
805 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
806 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
807 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
808 | } |
809 | } |
810 | else { /* order == 11 */ |
811 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; |
812 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
813 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
814 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
815 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
816 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
817 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
818 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
819 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
820 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
821 | q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); |
822 | q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); |
823 | |
824 | for(i = 0; i < (int)data_len-3; i+=4) { |
825 | __m256i summ, mull; |
826 | summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); |
827 | mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); |
828 | mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); |
829 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
830 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
831 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
832 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
833 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
834 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
835 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
836 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
837 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
838 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
839 | } |
840 | } |
841 | } |
842 | else { |
843 | if(order == 10) { |
844 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; |
845 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
846 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
847 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
848 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
849 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
850 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
851 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
852 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
853 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
854 | q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); |
855 | |
856 | for(i = 0; i < (int)data_len-3; i+=4) { |
857 | __m256i summ, mull; |
858 | summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); |
859 | mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); |
860 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
861 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
862 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
863 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
864 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
865 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
866 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
867 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
868 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
869 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
870 | } |
871 | } |
872 | else { /* order == 9 */ |
873 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; |
874 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
875 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
876 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
877 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
878 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
879 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
880 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
881 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
882 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
883 | |
884 | for(i = 0; i < (int)data_len-3; i+=4) { |
885 | __m256i summ, mull; |
886 | summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); |
887 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
888 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
889 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
890 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
891 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
892 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
893 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
894 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
895 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
896 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
897 | } |
898 | } |
899 | } |
900 | } |
901 | else if(order > 4) { |
902 | if(order > 6) { |
903 | if(order == 8) { |
904 | __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
905 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
906 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
907 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
908 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
909 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
910 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
911 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
912 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
913 | |
914 | for(i = 0; i < (int)data_len-3; i+=4) { |
915 | __m256i summ, mull; |
916 | summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); |
917 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
918 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
919 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
920 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
921 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
922 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
923 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
924 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
925 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
926 | } |
927 | } |
928 | else { /* order == 7 */ |
929 | __m256i q0, q1, q2, q3, q4, q5, q6; |
930 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
931 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
932 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
933 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
934 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
935 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
936 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
937 | |
938 | for(i = 0; i < (int)data_len-3; i+=4) { |
939 | __m256i summ, mull; |
940 | summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); |
941 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
942 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
943 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
944 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
945 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
946 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
947 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
948 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
949 | } |
950 | } |
951 | } |
952 | else { |
953 | if(order == 6) { |
954 | __m256i q0, q1, q2, q3, q4, q5; |
955 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
956 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
957 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
958 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
959 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
960 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
961 | |
962 | for(i = 0; i < (int)data_len-3; i+=4) { |
963 | __m256i summ, mull; |
964 | summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); |
965 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
966 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
967 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
968 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
969 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
970 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
971 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
972 | } |
973 | } |
974 | else { /* order == 5 */ |
975 | __m256i q0, q1, q2, q3, q4; |
976 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
977 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
978 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
979 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
980 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
981 | |
982 | for(i = 0; i < (int)data_len-3; i+=4) { |
983 | __m256i summ, mull; |
984 | summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); |
985 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
986 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
987 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
988 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
989 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
990 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
991 | } |
992 | } |
993 | } |
994 | } |
995 | else { |
996 | if(order > 2) { |
997 | if(order == 4) { |
998 | __m256i q0, q1, q2, q3; |
999 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
1000 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
1001 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
1002 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
1003 | |
1004 | for(i = 0; i < (int)data_len-3; i+=4) { |
1005 | __m256i summ, mull; |
1006 | summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); |
1007 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
1008 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
1009 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
1010 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
1011 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
1012 | } |
1013 | } |
1014 | else { /* order == 3 */ |
1015 | __m256i q0, q1, q2; |
1016 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
1017 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
1018 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
1019 | |
1020 | for(i = 0; i < (int)data_len-3; i+=4) { |
1021 | __m256i summ, mull; |
1022 | summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); |
1023 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
1024 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
1025 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
1026 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
1027 | } |
1028 | } |
1029 | } |
1030 | else { |
1031 | if(order == 2) { |
1032 | __m256i q0, q1; |
1033 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
1034 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
1035 | |
1036 | for(i = 0; i < (int)data_len-3; i+=4) { |
1037 | __m256i summ, mull; |
1038 | summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); |
1039 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
1040 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
1041 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
1042 | } |
1043 | } |
1044 | else { /* order == 1 */ |
1045 | __m256i q0; |
1046 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
1047 | |
1048 | for(i = 0; i < (int)data_len-3; i+=4) { |
1049 | __m256i summ; |
1050 | summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); |
1051 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
1052 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
1053 | } |
1054 | } |
1055 | } |
1056 | } |
1057 | for(; i < (int)data_len; i++) { |
1058 | sum = 0; |
1059 | switch(order) { |
1060 | case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; |
1061 | case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; |
1062 | case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; |
1063 | case 9: sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; |
1064 | case 8: sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; |
1065 | case 7: sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; |
1066 | case 6: sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; |
1067 | case 5: sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; |
1068 | case 4: sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; |
1069 | case 3: sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; |
1070 | case 2: sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; |
1071 | case 1: sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; |
1072 | } |
1073 | residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); |
1074 | } |
1075 | } |
1076 | else { /* order > 12 */ |
1077 | for(i = 0; i < (int)data_len; i++) { |
1078 | sum = 0; |
1079 | switch(order) { |
1080 | case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; |
1081 | case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; |
1082 | case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; |
1083 | case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; |
1084 | case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; |
1085 | case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; |
1086 | case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; |
1087 | case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; |
1088 | case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; |
1089 | case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; |
1090 | case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; |
1091 | case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; |
1092 | case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; |
1093 | case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; |
1094 | case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; |
1095 | case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; |
1096 | case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; |
1097 | case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; |
1098 | case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; |
1099 | case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13]; |
1100 | sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; |
1101 | sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; |
1102 | sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; |
1103 | sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; |
1104 | sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; |
1105 | sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; |
1106 | sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; |
1107 | sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; |
1108 | sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; |
1109 | sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; |
1110 | sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; |
1111 | sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; |
1112 | } |
1113 | residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); |
1114 | } |
1115 | } |
1116 | _mm256_zeroupper(); |
1117 | } |
1118 | |
1119 | #endif /* FLAC__AVX2_SUPPORTED */ |
1120 | #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ |
1121 | #endif /* FLAC__NO_ASM */ |
1122 | #endif /* FLAC__INTEGER_ONLY_LIBRARY */ |