pcsx_rearmed.git: deps/flac-1.3.2/src/libFLAC/lpc_intrin_sse2.c
1/* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2016 Xiph.Org Foundation
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#ifdef HAVE_CONFIG_H
34# include <config.h>
35#endif
36
37#include "private/cpu.h"
38
39#ifndef FLAC__INTEGER_ONLY_LIBRARY
40#ifndef FLAC__NO_ASM
41#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42#include "private/lpc.h"
43#ifdef FLAC__SSE2_SUPPORTED
44
45#include "FLAC/assert.h"
46#include "FLAC/format.h"
47
48#include <emmintrin.h> /* SSE2 */
49
50#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
51#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
52
53#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
54#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
55
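/* Annotation (not part of upstream FLAC): the *_RESULT macros above finish one
 * prediction.  In scalar terms, every kernel in this file computes
 *     sum = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order];
 *     residual[i] = data[i] - (sum >> lp_quantization);      (residual kernels)
 *     data[i] = residual[i] + (sum >> lp_quantization);      (restore kernels)
 * using 32-bit wrap-around arithmetic, matching the plain C fallback loops below. */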
56FLAC__SSE_TARGET("sse2")
57void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
58{
59 int i;
60 FLAC__int32 sum;
61 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
62
63 FLAC__ASSERT(order > 0);
64 FLAC__ASSERT(order <= 32);
65
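	/* Annotation (not upstream): this _16 variant broadcasts each coefficient into the
	   low 16 bits of every 32-bit lane (0xffff & qlp_coeff[k]), so _mm_madd_epi16
	   against four 32-bit samples yields coeff * (low 16 bits of each sample) in each
	   lane.  That is only exact because the caller selects this variant when both the
	   samples and the coefficients fit in 16 bits; four residuals are produced per
	   iteration and the scalar loop at the end handles the remaining 0-3 samples. */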
66 if(order <= 12) {
67 if(order > 8) {
68 if(order > 10) {
69 if(order == 12) {
70 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
71 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
72 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
73 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
74 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
75 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
76 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
77 q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
78 q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
79 q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
80 q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
81 q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
82 q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
83
84 for(i = 0; i < (int)data_len-3; i+=4) {
85 __m128i summ, mull;
86 summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
87 mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
88 mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
89 mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
90 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
91 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
92 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
93 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
94 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
95 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
96 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
97 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
98 summ = _mm_sra_epi32(summ, cnt);
99 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
100 }
101 }
102 else { /* order == 11 */
103 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
104 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
105 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
106 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
107 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
108 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
109 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
110 q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
111 q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
112 q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
113 q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
114 q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
115
116 for(i = 0; i < (int)data_len-3; i+=4) {
117 __m128i summ, mull;
118 summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
119 mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
120 mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
121 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
122 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
123 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
124 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
125 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
126 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
127 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
128 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
129 summ = _mm_sra_epi32(summ, cnt);
130 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
131 }
132 }
133 }
134 else {
135 if(order == 10) {
136 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
137 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
138 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
139 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
140 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
141 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
142 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
143 q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
144 q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
145 q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
146 q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
147
148 for(i = 0; i < (int)data_len-3; i+=4) {
149 __m128i summ, mull;
150 summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
151 mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
152 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
153 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
154 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
155 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
156 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
157 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
158 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
159 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
160 summ = _mm_sra_epi32(summ, cnt);
161 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
162 }
163 }
164 else { /* order == 9 */
165 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
166 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
167 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
168 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
169 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
170 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
171 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
172 q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
173 q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
174 q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
175
176 for(i = 0; i < (int)data_len-3; i+=4) {
177 __m128i summ, mull;
178 summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
179 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
180 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
181 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
182 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
183 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
184 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
185 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
186 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
187 summ = _mm_sra_epi32(summ, cnt);
188 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
189 }
190 }
191 }
192 }
193 else if(order > 4) {
194 if(order > 6) {
195 if(order == 8) {
196 __m128i q0, q1, q2, q3, q4, q5, q6, q7;
197 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
198 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
199 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
200 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
201 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
202 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
203 q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
204 q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
205
206 for(i = 0; i < (int)data_len-3; i+=4) {
207 __m128i summ, mull;
208 summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
209 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
210 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
211 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
212 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
213 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
214 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
215 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
216 summ = _mm_sra_epi32(summ, cnt);
217 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
218 }
219 }
220 else { /* order == 7 */
221 __m128i q0, q1, q2, q3, q4, q5, q6;
222 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
223 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
224 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
225 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
226 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
227 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
228 q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
229
230 for(i = 0; i < (int)data_len-3; i+=4) {
231 __m128i summ, mull;
232 summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
233 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
234 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
235 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
236 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
237 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
238 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
239 summ = _mm_sra_epi32(summ, cnt);
240 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
241 }
242 }
243 }
244 else {
245 if(order == 6) {
246 __m128i q0, q1, q2, q3, q4, q5;
247 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
248 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
249 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
250 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
251 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
252 q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
253
254 for(i = 0; i < (int)data_len-3; i+=4) {
255 __m128i summ, mull;
256 summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
257 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
258 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
259 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
260 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
261 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
262 summ = _mm_sra_epi32(summ, cnt);
263 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
264 }
265 }
266 else { /* order == 5 */
267 __m128i q0, q1, q2, q3, q4;
268 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
269 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
270 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
271 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
272 q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
273
274 for(i = 0; i < (int)data_len-3; i+=4) {
275 __m128i summ, mull;
276 summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
277 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
278 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
279 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
280 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
281 summ = _mm_sra_epi32(summ, cnt);
282 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
283 }
284 }
285 }
286 }
287 else {
288 if(order > 2) {
289 if(order == 4) {
290 __m128i q0, q1, q2, q3;
291 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
292 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
293 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
294 q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
295
296 for(i = 0; i < (int)data_len-3; i+=4) {
297 __m128i summ, mull;
298 summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
299 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
300 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
301 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
302 summ = _mm_sra_epi32(summ, cnt);
303 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
304 }
305 }
306 else { /* order == 3 */
307 __m128i q0, q1, q2;
308 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
309 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
310 q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
311
312 for(i = 0; i < (int)data_len-3; i+=4) {
313 __m128i summ, mull;
314 summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
315 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
316 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
317 summ = _mm_sra_epi32(summ, cnt);
318 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
319 }
320 }
321 }
322 else {
323 if(order == 2) {
324 __m128i q0, q1;
325 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
326 q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
327
328 for(i = 0; i < (int)data_len-3; i+=4) {
329 __m128i summ, mull;
330 summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
331 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
332 summ = _mm_sra_epi32(summ, cnt);
333 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
334 }
335 }
336 else { /* order == 1 */
337 __m128i q0;
338 q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
339
340 for(i = 0; i < (int)data_len-3; i+=4) {
341 __m128i summ;
342 summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
343 summ = _mm_sra_epi32(summ, cnt);
344 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
345 }
346 }
347 }
348 }
349 for(; i < (int)data_len; i++) {
350 sum = 0;
351 switch(order) {
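					/* all cases below intentionally fall through; each case adds one more tap */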
352 case 12: sum += qlp_coeff[11] * data[i-12];
353 case 11: sum += qlp_coeff[10] * data[i-11];
354 case 10: sum += qlp_coeff[ 9] * data[i-10];
355 case 9: sum += qlp_coeff[ 8] * data[i- 9];
356 case 8: sum += qlp_coeff[ 7] * data[i- 8];
357 case 7: sum += qlp_coeff[ 6] * data[i- 7];
358 case 6: sum += qlp_coeff[ 5] * data[i- 6];
359 case 5: sum += qlp_coeff[ 4] * data[i- 5];
360 case 4: sum += qlp_coeff[ 3] * data[i- 4];
361 case 3: sum += qlp_coeff[ 2] * data[i- 3];
362 case 2: sum += qlp_coeff[ 1] * data[i- 2];
363 case 1: sum += qlp_coeff[ 0] * data[i- 1];
364 }
365 residual[i] = data[i] - (sum >> lp_quantization);
366 }
367 }
368 else { /* order > 12 */
369 for(i = 0; i < (int)data_len; i++) {
370 sum = 0;
371 switch(order) {
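				/* all cases below intentionally fall through; each case adds one more tap */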
372 case 32: sum += qlp_coeff[31] * data[i-32];
373 case 31: sum += qlp_coeff[30] * data[i-31];
374 case 30: sum += qlp_coeff[29] * data[i-30];
375 case 29: sum += qlp_coeff[28] * data[i-29];
376 case 28: sum += qlp_coeff[27] * data[i-28];
377 case 27: sum += qlp_coeff[26] * data[i-27];
378 case 26: sum += qlp_coeff[25] * data[i-26];
379 case 25: sum += qlp_coeff[24] * data[i-25];
380 case 24: sum += qlp_coeff[23] * data[i-24];
381 case 23: sum += qlp_coeff[22] * data[i-23];
382 case 22: sum += qlp_coeff[21] * data[i-22];
383 case 21: sum += qlp_coeff[20] * data[i-21];
384 case 20: sum += qlp_coeff[19] * data[i-20];
385 case 19: sum += qlp_coeff[18] * data[i-19];
386 case 18: sum += qlp_coeff[17] * data[i-18];
387 case 17: sum += qlp_coeff[16] * data[i-17];
388 case 16: sum += qlp_coeff[15] * data[i-16];
389 case 15: sum += qlp_coeff[14] * data[i-15];
390 case 14: sum += qlp_coeff[13] * data[i-14];
391 case 13: sum += qlp_coeff[12] * data[i-13];
392 sum += qlp_coeff[11] * data[i-12];
393 sum += qlp_coeff[10] * data[i-11];
394 sum += qlp_coeff[ 9] * data[i-10];
395 sum += qlp_coeff[ 8] * data[i- 9];
396 sum += qlp_coeff[ 7] * data[i- 8];
397 sum += qlp_coeff[ 6] * data[i- 7];
398 sum += qlp_coeff[ 5] * data[i- 6];
399 sum += qlp_coeff[ 4] * data[i- 5];
400 sum += qlp_coeff[ 3] * data[i- 4];
401 sum += qlp_coeff[ 2] * data[i- 3];
402 sum += qlp_coeff[ 1] * data[i- 2];
403 sum += qlp_coeff[ 0] * data[i- 1];
404 }
405 residual[i] = data[i] - (sum >> lp_quantization);
406 }
407 }
408}
409
410FLAC__SSE_TARGET("sse2")
411void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
412{
413 int i;
414
415 FLAC__ASSERT(order > 0);
416 FLAC__ASSERT(order <= 32);
417
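	/* Annotation (not upstream): in this 32-bit variant the coefficients are loaded
	   (mostly in pairs) and shuffled so that each coefficient occupies an even 32-bit
	   lane, and each pair of taps is multiplied with _mm_mul_epu32.  The unsigned
	   multiply is safe because only the low 32 bits of every product are kept, and
	   those are identical for signed and unsigned operands.  A horizontal add plus
	   RESIDUAL32_RESULT then finishes one prediction per iteration. */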
418 if(order <= 12) {
419 if(order > 8) { /* order == 9, 10, 11, 12 */
420 if(order > 10) { /* order == 11, 12 */
421 if(order == 12) {
422 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
423 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
424 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
425 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
426 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
427 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
428 xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
429
430 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
431 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
432 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
433 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
434 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
435 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]
436
437 for(i = 0; i < (int)data_len; i++) {
438 //sum = 0;
439 //sum += qlp_coeff[11] * data[i-12];
440 //sum += qlp_coeff[10] * data[i-11];
441 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
442 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
443 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
444
445 //sum += qlp_coeff[9] * data[i-10];
446 //sum += qlp_coeff[8] * data[i-9];
447 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
448 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
449 xmm6 = _mm_mul_epu32(xmm6, xmm4);
450 xmm7 = _mm_add_epi32(xmm7, xmm6);
451
452 //sum += qlp_coeff[7] * data[i-8];
453 //sum += qlp_coeff[6] * data[i-7];
454 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
455 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
456 xmm6 = _mm_mul_epu32(xmm6, xmm3);
457 xmm7 = _mm_add_epi32(xmm7, xmm6);
458
459 //sum += qlp_coeff[5] * data[i-6];
460 //sum += qlp_coeff[4] * data[i-5];
461 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
462 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
463 xmm6 = _mm_mul_epu32(xmm6, xmm2);
464 xmm7 = _mm_add_epi32(xmm7, xmm6);
465
466 //sum += qlp_coeff[3] * data[i-4];
467 //sum += qlp_coeff[2] * data[i-3];
468 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
469 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
470 xmm6 = _mm_mul_epu32(xmm6, xmm1);
471 xmm7 = _mm_add_epi32(xmm7, xmm6);
472
473 //sum += qlp_coeff[1] * data[i-2];
474 //sum += qlp_coeff[0] * data[i-1];
475 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
476 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
477 xmm6 = _mm_mul_epu32(xmm6, xmm0);
478 xmm7 = _mm_add_epi32(xmm7, xmm6);
479
480 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
481 RESIDUAL32_RESULT(xmm7);
482 }
483 }
484 else { /* order == 11 */
485 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
486 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
487 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
488 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
489 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
490 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
491 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
492
493 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
494 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
495 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
496 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
497 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
498
499 for(i = 0; i < (int)data_len; i++) {
500 //sum = 0;
501 //sum = qlp_coeff[10] * data[i-11];
502 xmm7 = _mm_cvtsi32_si128(data[i-11]);
503 xmm7 = _mm_mul_epu32(xmm7, xmm5);
504
505 //sum += qlp_coeff[9] * data[i-10];
506 //sum += qlp_coeff[8] * data[i-9];
507 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
508 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
509 xmm6 = _mm_mul_epu32(xmm6, xmm4);
510 xmm7 = _mm_add_epi32(xmm7, xmm6);
511
512 //sum += qlp_coeff[7] * data[i-8];
513 //sum += qlp_coeff[6] * data[i-7];
514 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
515 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
516 xmm6 = _mm_mul_epu32(xmm6, xmm3);
517 xmm7 = _mm_add_epi32(xmm7, xmm6);
518
519 //sum += qlp_coeff[5] * data[i-6];
520 //sum += qlp_coeff[4] * data[i-5];
521 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
522 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
523 xmm6 = _mm_mul_epu32(xmm6, xmm2);
524 xmm7 = _mm_add_epi32(xmm7, xmm6);
525
526 //sum += qlp_coeff[3] * data[i-4];
527 //sum += qlp_coeff[2] * data[i-3];
528 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
529 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
530 xmm6 = _mm_mul_epu32(xmm6, xmm1);
531 xmm7 = _mm_add_epi32(xmm7, xmm6);
532
533 //sum += qlp_coeff[1] * data[i-2];
534 //sum += qlp_coeff[0] * data[i-1];
535 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
536 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
537 xmm6 = _mm_mul_epu32(xmm6, xmm0);
538 xmm7 = _mm_add_epi32(xmm7, xmm6);
539
540 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
541 RESIDUAL32_RESULT(xmm7);
542 }
543 }
544 }
545 else { /* order == 9, 10 */
546 if(order == 10) {
547 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
548 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
549 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
550 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
551 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
552 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
553
554 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
555 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
556 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
557 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
558 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
559
560 for(i = 0; i < (int)data_len; i++) {
561 //sum = 0;
562 //sum += qlp_coeff[9] * data[i-10];
563 //sum += qlp_coeff[8] * data[i-9];
564 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
565 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
566 xmm7 = _mm_mul_epu32(xmm7, xmm4);
567
568 //sum += qlp_coeff[7] * data[i-8];
569 //sum += qlp_coeff[6] * data[i-7];
570 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
571 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
572 xmm6 = _mm_mul_epu32(xmm6, xmm3);
573 xmm7 = _mm_add_epi32(xmm7, xmm6);
574
575 //sum += qlp_coeff[5] * data[i-6];
576 //sum += qlp_coeff[4] * data[i-5];
577 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
578 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
579 xmm6 = _mm_mul_epu32(xmm6, xmm2);
580 xmm7 = _mm_add_epi32(xmm7, xmm6);
581
582 //sum += qlp_coeff[3] * data[i-4];
583 //sum += qlp_coeff[2] * data[i-3];
584 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
585 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
586 xmm6 = _mm_mul_epu32(xmm6, xmm1);
587 xmm7 = _mm_add_epi32(xmm7, xmm6);
588
589 //sum += qlp_coeff[1] * data[i-2];
590 //sum += qlp_coeff[0] * data[i-1];
591 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
592 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
593 xmm6 = _mm_mul_epu32(xmm6, xmm0);
594 xmm7 = _mm_add_epi32(xmm7, xmm6);
595
596 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
597 RESIDUAL32_RESULT(xmm7);
598 }
599 }
600 else { /* order == 9 */
601 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
602 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
603 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
604 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
605 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
606 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
607
608 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
609 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
610 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
611 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
612
613 for(i = 0; i < (int)data_len; i++) {
614 //sum = 0;
615 //sum = qlp_coeff[8] * data[i-9];
616 xmm7 = _mm_cvtsi32_si128(data[i-9]);
617 xmm7 = _mm_mul_epu32(xmm7, xmm4);
618
619 //sum += qlp_coeff[7] * data[i-8];
620 //sum += qlp_coeff[6] * data[i-7];
621 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
622 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
623 xmm6 = _mm_mul_epu32(xmm6, xmm3);
624 xmm7 = _mm_add_epi32(xmm7, xmm6);
625
626 //sum += qlp_coeff[5] * data[i-6];
627 //sum += qlp_coeff[4] * data[i-5];
628 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
629 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
630 xmm6 = _mm_mul_epu32(xmm6, xmm2);
631 xmm7 = _mm_add_epi32(xmm7, xmm6);
632
633 //sum += qlp_coeff[3] * data[i-4];
634 //sum += qlp_coeff[2] * data[i-3];
635 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
636 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
637 xmm6 = _mm_mul_epu32(xmm6, xmm1);
638 xmm7 = _mm_add_epi32(xmm7, xmm6);
639
640 //sum += qlp_coeff[1] * data[i-2];
641 //sum += qlp_coeff[0] * data[i-1];
642 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
643 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
644 xmm6 = _mm_mul_epu32(xmm6, xmm0);
645 xmm7 = _mm_add_epi32(xmm7, xmm6);
646
647 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
648 RESIDUAL32_RESULT(xmm7);
649 }
650 }
651 }
652 }
653 else if(order > 4) { /* order == 5, 6, 7, 8 */
654 if(order > 6) { /* order == 7, 8 */
655 if(order == 8) {
656 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
657 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
658 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
659 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
660 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
661
662 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
663 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
664 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
665 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
666
667 for(i = 0; i < (int)data_len; i++) {
668 //sum = 0;
669 //sum += qlp_coeff[7] * data[i-8];
670 //sum += qlp_coeff[6] * data[i-7];
671 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
672 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
673 xmm7 = _mm_mul_epu32(xmm7, xmm3);
674
675 //sum += qlp_coeff[5] * data[i-6];
676 //sum += qlp_coeff[4] * data[i-5];
677 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
678 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
679 xmm6 = _mm_mul_epu32(xmm6, xmm2);
680 xmm7 = _mm_add_epi32(xmm7, xmm6);
681
682 //sum += qlp_coeff[3] * data[i-4];
683 //sum += qlp_coeff[2] * data[i-3];
684 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
685 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
686 xmm6 = _mm_mul_epu32(xmm6, xmm1);
687 xmm7 = _mm_add_epi32(xmm7, xmm6);
688
689 //sum += qlp_coeff[1] * data[i-2];
690 //sum += qlp_coeff[0] * data[i-1];
691 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
692 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
693 xmm6 = _mm_mul_epu32(xmm6, xmm0);
694 xmm7 = _mm_add_epi32(xmm7, xmm6);
695
696 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
697 RESIDUAL32_RESULT(xmm7);
698 }
699 }
700 else { /* order == 7 */
701 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
702 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
703 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
704 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
705 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
706
707 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
708 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
709 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
710
711 for(i = 0; i < (int)data_len; i++) {
712 //sum = 0;
713 //sum = qlp_coeff[6] * data[i-7];
714 xmm7 = _mm_cvtsi32_si128(data[i-7]);
715 xmm7 = _mm_mul_epu32(xmm7, xmm3);
716
717 //sum += qlp_coeff[5] * data[i-6];
718 //sum += qlp_coeff[4] * data[i-5];
719 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
720 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
721 xmm6 = _mm_mul_epu32(xmm6, xmm2);
722 xmm7 = _mm_add_epi32(xmm7, xmm6);
723
724 //sum += qlp_coeff[3] * data[i-4];
725 //sum += qlp_coeff[2] * data[i-3];
726 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
727 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
728 xmm6 = _mm_mul_epu32(xmm6, xmm1);
729 xmm7 = _mm_add_epi32(xmm7, xmm6);
730
731 //sum += qlp_coeff[1] * data[i-2];
732 //sum += qlp_coeff[0] * data[i-1];
733 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
734 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
735 xmm6 = _mm_mul_epu32(xmm6, xmm0);
736 xmm7 = _mm_add_epi32(xmm7, xmm6);
737
738 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
739 RESIDUAL32_RESULT(xmm7);
740 }
741 }
742 }
743 else { /* order == 5, 6 */
744 if(order == 6) {
745 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
746 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
747 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
748 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
749
750 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
751 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
752 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
753
754 for(i = 0; i < (int)data_len; i++) {
755 //sum = 0;
756 //sum += qlp_coeff[5] * data[i-6];
757 //sum += qlp_coeff[4] * data[i-5];
758 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
759 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
760 xmm7 = _mm_mul_epu32(xmm7, xmm2);
761
762 //sum += qlp_coeff[3] * data[i-4];
763 //sum += qlp_coeff[2] * data[i-3];
764 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
765 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
766 xmm6 = _mm_mul_epu32(xmm6, xmm1);
767 xmm7 = _mm_add_epi32(xmm7, xmm6);
768
769 //sum += qlp_coeff[1] * data[i-2];
770 //sum += qlp_coeff[0] * data[i-1];
771 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
772 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
773 xmm6 = _mm_mul_epu32(xmm6, xmm0);
774 xmm7 = _mm_add_epi32(xmm7, xmm6);
775
776 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
777 RESIDUAL32_RESULT(xmm7);
778 }
779 }
780 else { /* order == 5 */
781 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
782 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
783 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
784 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
785
786 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
787 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
788
789 for(i = 0; i < (int)data_len; i++) {
790 //sum = 0;
791 //sum = qlp_coeff[4] * data[i-5];
792 xmm7 = _mm_cvtsi32_si128(data[i-5]);
793 xmm7 = _mm_mul_epu32(xmm7, xmm2);
794
795 //sum += qlp_coeff[3] * data[i-4];
796 //sum += qlp_coeff[2] * data[i-3];
797 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
798 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
799 xmm6 = _mm_mul_epu32(xmm6, xmm1);
800 xmm7 = _mm_add_epi32(xmm7, xmm6);
801
802 //sum += qlp_coeff[1] * data[i-2];
803 //sum += qlp_coeff[0] * data[i-1];
804 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
805 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
806 xmm6 = _mm_mul_epu32(xmm6, xmm0);
807 xmm7 = _mm_add_epi32(xmm7, xmm6);
808
809 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
810 RESIDUAL32_RESULT(xmm7);
811 }
812 }
813 }
814 }
815 else { /* order == 1, 2, 3, 4 */
816 if(order > 2) { /* order == 3, 4 */
817 if(order == 4) {
818 __m128i xmm0, xmm1, xmm6, xmm7;
819 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
820 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
821
822 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
823 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
824
825 for(i = 0; i < (int)data_len; i++) {
826 //sum = 0;
827 //sum += qlp_coeff[3] * data[i-4];
828 //sum += qlp_coeff[2] * data[i-3];
829 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
830 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
831 xmm7 = _mm_mul_epu32(xmm7, xmm1);
832
833 //sum += qlp_coeff[1] * data[i-2];
834 //sum += qlp_coeff[0] * data[i-1];
835 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
836 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
837 xmm6 = _mm_mul_epu32(xmm6, xmm0);
838 xmm7 = _mm_add_epi32(xmm7, xmm6);
839
840 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
841 RESIDUAL32_RESULT(xmm7);
842 }
843 }
844 else { /* order == 3 */
845 __m128i xmm0, xmm1, xmm6, xmm7;
846 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
847 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
848
849 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
850
851 for(i = 0; i < (int)data_len; i++) {
852 //sum = 0;
853 //sum = qlp_coeff[2] * data[i-3];
854 xmm7 = _mm_cvtsi32_si128(data[i-3]);
855 xmm7 = _mm_mul_epu32(xmm7, xmm1);
856
857 //sum += qlp_coeff[1] * data[i-2];
858 //sum += qlp_coeff[0] * data[i-1];
859 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
860 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
861 xmm6 = _mm_mul_epu32(xmm6, xmm0);
862 xmm7 = _mm_add_epi32(xmm7, xmm6);
863
864 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
865 RESIDUAL32_RESULT(xmm7);
866 }
867 }
868 }
869 else { /* order == 1, 2 */
870 if(order == 2) {
871 __m128i xmm0, xmm7;
872 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
873 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
874
875 for(i = 0; i < (int)data_len; i++) {
876 //sum = 0;
877 //sum += qlp_coeff[1] * data[i-2];
878 //sum += qlp_coeff[0] * data[i-1];
879 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
880 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
881 xmm7 = _mm_mul_epu32(xmm7, xmm0);
882
883 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
884 RESIDUAL32_RESULT(xmm7);
885 }
886 }
887 else { /* order == 1 */
888 for(i = 0; i < (int)data_len; i++)
889 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
890 }
891 }
892 }
893 }
894 else { /* order > 12 */
895 FLAC__int32 sum;
896 for(i = 0; i < (int)data_len; i++) {
897 sum = 0;
898 switch(order) {
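				/* all cases below intentionally fall through; each case adds one more tap */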
899 case 32: sum += qlp_coeff[31] * data[i-32];
900 case 31: sum += qlp_coeff[30] * data[i-31];
901 case 30: sum += qlp_coeff[29] * data[i-30];
902 case 29: sum += qlp_coeff[28] * data[i-29];
903 case 28: sum += qlp_coeff[27] * data[i-28];
904 case 27: sum += qlp_coeff[26] * data[i-27];
905 case 26: sum += qlp_coeff[25] * data[i-26];
906 case 25: sum += qlp_coeff[24] * data[i-25];
907 case 24: sum += qlp_coeff[23] * data[i-24];
908 case 23: sum += qlp_coeff[22] * data[i-23];
909 case 22: sum += qlp_coeff[21] * data[i-22];
910 case 21: sum += qlp_coeff[20] * data[i-21];
911 case 20: sum += qlp_coeff[19] * data[i-20];
912 case 19: sum += qlp_coeff[18] * data[i-19];
913 case 18: sum += qlp_coeff[17] * data[i-18];
914 case 17: sum += qlp_coeff[16] * data[i-17];
915 case 16: sum += qlp_coeff[15] * data[i-16];
916 case 15: sum += qlp_coeff[14] * data[i-15];
917 case 14: sum += qlp_coeff[13] * data[i-14];
918 case 13: sum += qlp_coeff[12] * data[i-13];
919 sum += qlp_coeff[11] * data[i-12];
920 sum += qlp_coeff[10] * data[i-11];
921 sum += qlp_coeff[ 9] * data[i-10];
922 sum += qlp_coeff[ 8] * data[i- 9];
923 sum += qlp_coeff[ 7] * data[i- 8];
924 sum += qlp_coeff[ 6] * data[i- 7];
925 sum += qlp_coeff[ 5] * data[i- 6];
926 sum += qlp_coeff[ 4] * data[i- 5];
927 sum += qlp_coeff[ 3] * data[i- 4];
928 sum += qlp_coeff[ 2] * data[i- 3];
929 sum += qlp_coeff[ 1] * data[i- 2];
930 sum += qlp_coeff[ 0] * data[i- 1];
931 }
932 residual[i] = data[i] - (sum >> lp_quantization);
933 }
934 }
935}
936
937#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */
938
939FLAC__SSE_TARGET("sse2")
940void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
941{
942 if (order < 8 || order > 12) {
943 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
944 return;
945 }
946 if (data_len == 0)
947 return;
948
949 FLAC__ASSERT(order >= 8);
950 FLAC__ASSERT(order <= 12);
951
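	/* Annotation (not upstream): restoration is serial, since each restored sample
	   feeds the next prediction, so the SIMD is applied within the dot product rather
	   than across samples.  The loops below keep the most recent samples packed as
	   16-bit values (xmm3, plus xmm4 when order > 8), shift the history left and insert
	   each freshly restored sample with _mm_insert_epi16, and redo the _mm_madd_epi16
	   dot product; the coefficients are pre-packed to 16 bits with _mm_packs_epi32. */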
952 if(order > 8) { /* order == 9, 10, 11, 12 */
953 FLAC__int32 curr;
954 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
955 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
956 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
957 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
958 switch(order) /* ...and zero them out */
959 {
960 case 9:
961 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
962 case 10:
963 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
964 case 11:
965 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
966 }
967 xmm2 = _mm_setzero_si128();
968 xmm0 = _mm_packs_epi32(xmm0, xmm6);
969 xmm1 = _mm_packs_epi32(xmm1, xmm2);
970
971 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
972 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
973 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
974 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
975 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
976 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
977 xmm4 = _mm_packs_epi32(xmm4, xmm2);
978 xmm3 = _mm_packs_epi32(xmm3, xmm5);
979
980 xmm7 = _mm_slli_si128(xmm1, 2);
981 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
982 xmm2 = _mm_slli_si128(xmm0, 2);
983
984 /* xmm0, xmm1: qlp_coeff
985 xmm2, xmm7: qlp_coeff << 16 bit
986 xmm3, xmm4: data */
987
988 xmm5 = _mm_madd_epi16(xmm4, xmm1);
989 xmm6 = _mm_madd_epi16(xmm3, xmm0);
990 xmm6 = _mm_add_epi32(xmm6, xmm5);
991 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
992 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
993
994 DATA16_RESULT(xmm6);
995
996 data_len--;
997
998 if(data_len % 2) {
999 xmm6 = _mm_srli_si128(xmm3, 14);
1000 xmm4 = _mm_slli_si128(xmm4, 2);
1001 xmm3 = _mm_slli_si128(xmm3, 2);
1002 xmm4 = _mm_or_si128(xmm4, xmm6);
1003 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1004
1005 xmm5 = _mm_madd_epi16(xmm4, xmm1);
1006 xmm6 = _mm_madd_epi16(xmm3, xmm0);
1007 xmm6 = _mm_add_epi32(xmm6, xmm5);
1008 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1009 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1010
1011 DATA16_RESULT(xmm6);
1012
1013 data_len--;
1014 }
1015
1016 while(data_len) { /* data_len is a multiple of 2 */
 1017 /* unrolled by 2: saves one _mm_slli_si128 per data element, at the cost of keeping the 16-bit-shifted copy of qlp_coeff in xmm2:xmm7 */
1018 xmm6 = _mm_srli_si128(xmm3, 12);
1019 xmm4 = _mm_slli_si128(xmm4, 4);
1020 xmm3 = _mm_slli_si128(xmm3, 4);
1021 xmm4 = _mm_or_si128(xmm4, xmm6);
1022 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1023
1024 xmm5 = _mm_madd_epi16(xmm4, xmm7);
1025 xmm6 = _mm_madd_epi16(xmm3, xmm2);
1026 xmm6 = _mm_add_epi32(xmm6, xmm5);
1027 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1028 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1029
1030 DATA16_RESULT(xmm6);
1031
1032 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1033
1034 xmm5 = _mm_madd_epi16(xmm4, xmm1);
1035 xmm6 = _mm_madd_epi16(xmm3, xmm0);
1036 xmm6 = _mm_add_epi32(xmm6, xmm5);
1037 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1038 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1039
1040 DATA16_RESULT(xmm6);
1041
1042 data_len-=2;
1043 }
1044 } /* endif(order > 8) */
1045 else
1046 {
1047 FLAC__int32 curr;
1048 __m128i xmm0, xmm1, xmm3, xmm6;
1049 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1050 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1051 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1052
1053 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1054 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1055 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1056 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1057 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1058
1059 /* xmm0: qlp_coeff
1060 xmm3: data */
1061
1062 xmm6 = _mm_madd_epi16(xmm3, xmm0);
1063 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1064 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1065
1066 DATA16_RESULT(xmm6);
1067
1068 data_len--;
1069
1070 while(data_len) {
1071 xmm3 = _mm_slli_si128(xmm3, 2);
1072 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1073
1074 xmm6 = _mm_madd_epi16(xmm3, xmm0);
1075 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1076 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1077
1078 DATA16_RESULT(xmm6);
1079
1080 data_len--;
1081 }
1082 }
1083}
1084
1085#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */
1086
1087#endif /* FLAC__SSE2_SUPPORTED */
1088#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1089#endif /* FLAC__NO_ASM */
1090#endif /* FLAC__INTEGER_ONLY_LIBRARY */