Commit | Line | Data |
---|---|---|
3719602c PC |
1 | /* Copyright (C) 2010-2020 The RetroArch team |
2 | * | |
3 | * --------------------------------------------------------------------------------------- | |
4 | * The following license statement only applies to this file (scaler_int.c). | |
5 | * --------------------------------------------------------------------------------------- | |
6 | * | |
7 | * Permission is hereby granted, free of charge, | |
8 | * to any person obtaining a copy of this software and associated documentation files (the "Software"), | |
9 | * to deal in the Software without restriction, including without limitation the rights to | |
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
11 | * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | |
12 | * | |
13 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
16 | * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
19 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
21 | */ | |
22 | ||
23 | #include <gfx/scaler/scaler_int.h> | |
24 | ||
25 | #include <retro_inline.h> | |
26 | ||
27 | #ifdef SCALER_NO_SIMD | |
28 | #undef __SSE2__ | |
29 | #endif | |
30 | ||
31 | #if defined(__SSE2__) | |
32 | #include <emmintrin.h> | |
33 | #ifdef _WIN32 | |
34 | #include <intrin.h> | |
35 | #endif | |
36 | #endif | |
37 | ||
38 | /* ARGB8888 scaler is split in two: | |
39 | * | |
40 | * First, horizontal scaler is applied. | |
41 | * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 | |
42 | * to left to occupy 15 bits. | |
43 | * | |
44 | * The sign bit is kept empty as we have to do signed multiplication for the | |
45 | * filter. | |
46 | * | |
47 | * A mulhi [(a * b) >> 16] is applied which loses some precision, but is | |
48 | * very efficient for SIMD. | |
49 | * It is accurate enough for 8-bit purposes. | |
50 | * | |
51 | * The fixed point 1.0 for filter is (1 << 14). After horizontal scale, | |
52 | * the output is kept with 16-bit channels, and will now have 13 bits | |
53 | * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2. | |
54 | * | |
55 | * Vertical scaler takes the 13 bit channels, and performs the | |
56 | * same mulhi steps. | |
57 | * Another 2 bits of precision is lost, which ends up as 11 bits. | |
58 | * Scaling is now complete. Channels are shifted right by 3, and saturated | |
59 | * into 8-bit values. | |
60 | * | |
61 | * The C version of scalers perform the exact same operations as the | |
62 | * SIMD code for testing purposes. | |
63 | */ | |
64 | ||
/* Vertical pass of the two-pass ARGB8888 scaler.
 *
 * Consumes the intermediate image written by scaler_argb8888_horiz():
 * one uint64_t per pixel holding four 16-bit channels (A,R,G,B from the
 * high word down) with 13 bits of precision each (see the precision
 * walkthrough in the header comment above).
 *
 * ctx     - scaler state: ctx->vert.{filter,filter_pos,filter_stride,
 *           filter_len} describe the vertical filter bank and
 *           ctx->scaled.{frame,stride} the intermediate buffer
 *           (stride is in bytes, hence ">> 3" to step uint64_t rows).
 * output_ - destination ARGB8888 (uint32_t per pixel) buffer.
 * stride  - destination stride in bytes (">> 2" converts to a
 *           uint32_t row step).
 */
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
{
   int h, w, y;
   const uint64_t *input = ctx->scaled.frame;
   uint32_t *output = (uint32_t*)output_;

   const int16_t *filter_vert = ctx->vert.filter;

   for (h = 0; h < ctx->out_height; h++,
         filter_vert += ctx->vert.filter_stride, output += stride >> 2)
   {
      /* First source row contributing to output row h. */
      const uint64_t *input_base = input + ctx->vert.filter_pos[h]
         * (ctx->scaled.stride >> 3);

      for (w = 0; w < ctx->out_width; w++)
      {
         const uint64_t *input_base_y = input_base + w;
#if defined(__SSE2__)
         __m128i final;
         __m128i res = _mm_setzero_si128();

         /* Process two filter taps (two source rows) per iteration:
          * low 64 bits of the vector hold row y, high 64 bits row y+1.
          * "stride >> 2" advances by two rows of uint64_t pixels
          * (i.e. 2 * (stride >> 3)). */
         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
               input_base_y += (ctx->scaled.stride >> 2))
         {
            /* Multiplying a 16-bit coefficient by 0x0001000100010001
             * splats it across all four 16-bit lanes of a 64-bit half. */
            __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);

            /* (col * coeff) >> 16 per lane, accumulated with
             * signed saturation. */
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Tail: odd trailing tap, one row at a time. */
         for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
         {
            __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(0, input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Fold the high 64-bit half (rows y+1) onto the low half. */
         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
         /* Channels now hold 11 bits of precision; shift right by
          * 3 (= 7 - 2 - 2) to land in 8-bit range. */
         res = _mm_srai_epi16(res, (7 - 2 - 2));

         /* Saturate the four signed 16-bit channels to unsigned 8-bit
          * and store one packed ARGB8888 pixel. */
         final = _mm_packus_epi16(res, res);

         output[w] = _mm_cvtsi128_si32(final);
#else
         int16_t res_a = 0;
         int16_t res_r = 0;
         int16_t res_g = 0;
         int16_t res_b = 0;

         for (y = 0; y < ctx->vert.filter_len; y++,
               input_base_y += (ctx->scaled.stride >> 3))
         {
            uint64_t col = *input_base_y;

            /* Unpack the four 16-bit channels (A high, B low). */
            int16_t a = (col >> 48) & 0xffff;
            int16_t r = (col >> 32) & 0xffff;
            int16_t g = (col >> 16) & 0xffff;
            int16_t b = (col >> 0) & 0xffff;

            int16_t coeff = filter_vert[y];

            /* Mirrors _mm_mulhi_epi16: (a * coeff) >> 16.
             * NOTE(review): accumulation here wraps on int16_t overflow
             * whereas the SIMD path saturates (_mm_adds_epi16); the two
             * only diverge when the running sum overflows. */
            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

         /* Drop the remaining 3 bits of fixed-point precision. */
         res_a >>= (7 - 2 - 2);
         res_r >>= (7 - 2 - 2);
         res_g >>= (7 - 2 - 2);
         res_b >>= (7 - 2 - 2);

         /* Clamp each channel to [0, 255] and pack as ARGB8888. */
         output[w] =
            (clamp_8bit(res_a) << 24) |
            (clamp_8bit(res_r) << 16) |
            (clamp_8bit(res_g) << 8) |
            (clamp_8bit(res_b) << 0);
#endif
      }
   }
}
147 | ||
/* Horizontal pass of the two-pass ARGB8888 scaler.
 *
 * Reads ARGB8888 source pixels and writes the intermediate image used
 * by scaler_argb8888_vert(): one uint64_t per pixel with four 16-bit
 * channels. Each 8-bit channel is expanded and shifted left by 7 so the
 * value occupies 15 bits (sign bit kept clear for signed multiplies);
 * after the mulhi-by-filter step each channel retains 13 bits of
 * precision (see the header comment above).
 *
 * ctx    - scaler state: ctx->horiz.{filter,filter_pos,filter_stride,
 *          filter_len} describe the horizontal filter bank and
 *          ctx->scaled.{frame,stride,width,height} the intermediate
 *          buffer (stride in bytes, ">> 3" steps uint64_t rows).
 * input_ - source ARGB8888 (uint32_t per pixel) buffer.
 * stride - source stride in bytes (">> 2" converts to a uint32_t
 *          row step).
 */
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
{
   int h, w, x;
   const uint32_t *input = (uint32_t*)input_;
   uint64_t *output = ctx->scaled.frame;

   for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
         output += ctx->scaled.stride >> 3)
   {
      const int16_t *filter_horiz = ctx->horiz.filter;

      for (w = 0; w < ctx->scaled.width; w++,
            filter_horiz += ctx->horiz.filter_stride)
      {
         /* First source pixel contributing to output pixel w. */
         const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
#if defined(__SSE2__)
         __m128i res = _mm_setzero_si128();
#ifndef __x86_64__
         /* 32-bit x86 lacks _mm_cvtsi128_si64; store via two 32-bit
          * halves through this union instead. */
         union
         {
            uint32_t *u32;
            uint64_t *u64;
         } u;
#endif
         /* Two filter taps per iteration: pixel x in the low 64 bits,
          * pixel x+1 in the high 64 bits. */
         for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2)
         {
            /* Multiplying a 16-bit coefficient by 0x0001000100010001
             * splats it across the four lanes of a 64-bit half. */
            __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);

            /* Widen the 8-bit channels of both pixels to 16-bit. */
            __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
                     ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());

            /* Shift into 15-bit range, then (col * coeff) >> 16 per
             * lane, accumulated with signed saturation. */
            col = _mm_slli_epi16(col, 7);
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Tail: odd trailing tap, one pixel at a time. */
         for (; x < ctx->horiz.filter_len; x++)
         {
            __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
            __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());

            col = _mm_slli_epi16(col, 7);
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Fold the high 64-bit half (pixel x+1) onto the low half. */
         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);

#ifdef __x86_64__
         output[w] = _mm_cvtsi128_si64(res);
#else /* 32-bit doesn't have si64. Do it in two steps. */
         u.u64 = output + w;
         u.u32[0] = _mm_cvtsi128_si32(res);
         u.u32[1] = _mm_cvtsi128_si32(_mm_srli_si128(res, 4));
#endif
#else
         int16_t res_a = 0;
         int16_t res_r = 0;
         int16_t res_g = 0;
         int16_t res_b = 0;

         for (x = 0; x < ctx->horiz.filter_len; x++)
         {
            uint32_t col = input_base_x[x];

            /* Extract each 8-bit channel pre-shifted left by 7,
             * i.e. masked into bits [14:7] — matches the SIMD
             * unpack + _mm_slli_epi16(col, 7). */
            int16_t a = (col >> (24 - 7)) & (0xff << 7);
            int16_t r = (col >> (16 - 7)) & (0xff << 7);
            int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
            int16_t b = (col << ( 0 + 7)) & (0xff << 7);

            int16_t coeff = filter_horiz[x];

            /* Mirrors _mm_mulhi_epi16: (a * coeff) >> 16.
             * NOTE(review): accumulation wraps on int16_t overflow,
             * whereas the SIMD path saturates (_mm_adds_epi16). */
            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

         /* Pack the four 16-bit channels into one uint64_t
          * intermediate pixel (A high, B low). */
         output[w] = (
               (uint64_t)res_a << 48) |
            ((uint64_t)res_r << 32) |
            ((uint64_t)res_g << 16) |
            ((uint64_t)res_b << 0);
#endif
      }
   }
}
233 | ||
234 | void scaler_argb8888_point_special(const struct scaler_ctx *ctx, | |
235 | void *output_, const void *input_, | |
236 | int out_width, int out_height, | |
237 | int in_width, int in_height, | |
238 | int out_stride, int in_stride) | |
239 | { | |
240 | int h, w; | |
241 | int x_pos = (1 << 15) * in_width / out_width - (1 << 15); | |
242 | int x_step = (1 << 16) * in_width / out_width; | |
243 | int y_pos = (1 << 15) * in_height / out_height - (1 << 15); | |
244 | int y_step = (1 << 16) * in_height / out_height; | |
245 | const uint32_t *input = (const uint32_t*)input_; | |
246 | uint32_t *output = (uint32_t*)output_; | |
247 | ||
248 | if (x_pos < 0) | |
249 | x_pos = 0; | |
250 | if (y_pos < 0) | |
251 | y_pos = 0; | |
252 | ||
253 | for (h = 0; h < out_height; h++, y_pos += y_step, output += out_stride >> 2) | |
254 | { | |
255 | int x = x_pos; | |
256 | const uint32_t *inp = input + (y_pos >> 16) * (in_stride >> 2); | |
257 | ||
258 | for (w = 0; w < out_width; w++, x += x_step) | |
259 | output[w] = inp[x >> 16]; | |
260 | } | |
261 | } |