/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

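/*
 * Helper macros for finishing each filter step: the 32-bit prediction sum
 * sits in the low dword of an XMM register; extract it, arithmetic-shift it
 * right by lp_quantization, and either form the residual (RESIDUAL*) or
 * restore the sample (DATA*).  The 16-bit variants also advance the
 * data/residual pointers as they go; the 32-bit variants index with `i'.
 */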
#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

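/*
 * Residual computation for the narrow-data path: both the samples and the
 * quantized coefficients are assumed to fit in 16 bits (masking each
 * coefficient to its low 16 bits below would otherwise change its value),
 * so the 16x16-bit products cannot overflow the 32-bit lanes.
 */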
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
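	/* shift count for the _mm_sra_epi32() arithmetic shifts below */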
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

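	/*
	 * For order <= 12 the filter is vectorized: each coefficient is masked
	 * to its low 16 bits and splatted across all four dwords, so one
	 * _mm_madd_epi16() multiplies it exactly against four consecutive
	 * samples (the high 16 bits of each coefficient lane are zero, so the
	 * "add adjacent pairs" half of madd contributes nothing).  Four
	 * residuals are produced per iteration; a scalar loop finishes the
	 * remaining (at most 3) samples.
	 */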
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		for(; i < (int)data_len; i++) {
			sum = 0;
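			/* the cases fall through on purpose (here and in the order > 12
			   switches below): entering at `order' accumulates exactly
			   `order' taps */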
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9: sum += qlp_coeff[ 8] * data[i- 9];
				case 8: sum += qlp_coeff[ 7] * data[i- 8];
				case 7: sum += qlp_coeff[ 6] * data[i- 7];
				case 6: sum += qlp_coeff[ 5] * data[i- 6];
				case 5: sum += qlp_coeff[ 4] * data[i- 5];
				case 4: sum += qlp_coeff[ 3] * data[i- 4];
				case 3: sum += qlp_coeff[ 2] * data[i- 3];
				case 2: sum += qlp_coeff[ 1] * data[i- 2];
				case 1: sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

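/*
 * General residual computation with full 32-bit coefficients.  SSE2 has no
 * packed 32x32->32 multiply (_mm_mullo_epi32 is SSE4.1), so this version
 * computes one residual per iteration, multiplying coefficient/sample pairs
 * with _mm_mul_epu32 instead.
 */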
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

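	/*
	 * _mm_mul_epu32() multiplies the two even dwords of each operand into
	 * 64-bit products; only the low dword of each product is kept, and the
	 * low 32 bits of a 32x32 multiply are the same for signed and unsigned
	 * operands, so the unsigned multiply gives the right answer here.  The
	 * shuffles below move coefficient/sample pairs into those even lanes.
	 */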
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */

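/*
 * Inverse filter: data[i] = residual[i] + (sum >> lp_quantization).  Each
 * restored sample feeds the next prediction, so the loop is inherently
 * serial; SIMD is used to evaluate one whole prediction (the dot product
 * over the last `order' samples) per output sample.
 */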
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	if (order < 8 || order > 12) {
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}
	if (data_len == 0)
		return;

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 12);

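	/*
	 * Both the coefficients and the sample history are packed into signed
	 * 16-bit lanes (most recent sample in lane 0), so one _mm_madd_epi16()
	 * plus two horizontal adds evaluates the whole dot product; each
	 * restored sample is then shifted into the history vector with
	 * _mm_insert_epi16().
	 */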
	if(order > 8) { /* order == 9, 10, 11, 12 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
		switch(order) /* ...and zero them out */
		{
		case 9:
			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
		case 10:
			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
		case 11:
			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
		}
		xmm2 = _mm_setzero_si128();
		xmm0 = _mm_packs_epi32(xmm0, xmm6);
		xmm1 = _mm_packs_epi32(xmm1, xmm2);

		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm4 = _mm_packs_epi32(xmm4, xmm2);
		xmm3 = _mm_packs_epi32(xmm3, xmm5);

		xmm7 = _mm_slli_si128(xmm1, 2);
		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
		xmm2 = _mm_slli_si128(xmm0, 2);

		/* xmm0, xmm1: qlp_coeff
		   xmm2, xmm7: qlp_coeff << 16 bit
		   xmm3, xmm4: data */

		xmm5 = _mm_madd_epi16(xmm4, xmm1);
		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, xmm5);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

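		/* peel one sample if needed so the remaining count is even: the main
		   loop below emits two samples per iteration, alternating between
		   the plain and the 16-bit-shifted coefficient registers to save one
		   shift of the history per sample */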
		if(data_len % 2) {
			xmm6 = _mm_srli_si128(xmm3, 14);
			xmm4 = _mm_slli_si128(xmm4, 2);
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}

		while(data_len) { /* data_len is a multiple of 2 */
			/* one _mm_slli_si128 less per sample, but this needs the 16-bit-shifted qlp_coeff copies in xmm2:xmm7 */
			xmm6 = _mm_srli_si128(xmm3, 12);
			xmm4 = _mm_slli_si128(xmm4, 4);
			xmm3 = _mm_slli_si128(xmm3, 4);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 1);

			xmm5 = _mm_madd_epi16(xmm4, xmm7);
			xmm6 = _mm_madd_epi16(xmm3, xmm2);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len-=2;
		}
	} /* endif(order > 8) */
	else
	{
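		/* order == 8 here: orders below 8 were dispatched to the portable C
		   routine at the top of the function */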
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm3, xmm6;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm0 = _mm_packs_epi32(xmm0, xmm1);

		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_packs_epi32(xmm3, xmm1);

		/* xmm0: qlp_coeff
		   xmm3: data */

		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		while(data_len) {
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
	}
}

#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */