standalone: fix w/h confusion
[pcsx_rearmed.git] / deps / libretro-common / gfx / scaler / scaler_int.c
CommitLineData
3719602c
PC
1/* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (scaler_int.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include <gfx/scaler/scaler_int.h>
24
25#include <retro_inline.h>
26
27#ifdef SCALER_NO_SIMD
28#undef __SSE2__
29#endif
30
31#if defined(__SSE2__)
32#include <emmintrin.h>
33#ifdef _WIN32
34#include <intrin.h>
35#endif
36#endif
37
38/* ARGB8888 scaler is split in two:
39 *
40 * First, horizontal scaler is applied.
41 * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7
42 * to left to occupy 15 bits.
43 *
44 * The sign bit is kept empty as we have to do signed multiplication for the
45 * filter.
46 *
47 * A mulhi [(a * b) >> 16] is applied which loses some precision, but is
48 * very efficient for SIMD.
49 * It is accurate enough for 8-bit purposes.
50 *
51 * The fixed point 1.0 for filter is (1 << 14). After horizontal scale,
52 * the output is kept with 16-bit channels, and will now have 13 bits
53 * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
54 *
55 * Vertical scaler takes the 13 bit channels, and performs the
56 * same mulhi steps.
57 * Another 2 bits of precision is lost, which ends up as 11 bits.
58 * Scaling is now complete. Channels are shifted right by 3, and saturated
59 * into 8-bit values.
60 *
61 * The C versions of the scalers perform the exact same operations as the
62 * SIMD code for testing purposes.
63 */
64
65void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
66{
67 int h, w, y;
68 const uint64_t *input = ctx->scaled.frame;
69 uint32_t *output = (uint32_t*)output_;
70
71 const int16_t *filter_vert = ctx->vert.filter;
72
73 for (h = 0; h < ctx->out_height; h++,
74 filter_vert += ctx->vert.filter_stride, output += stride >> 2)
75 {
76 const uint64_t *input_base = input + ctx->vert.filter_pos[h]
77 * (ctx->scaled.stride >> 3);
78
79 for (w = 0; w < ctx->out_width; w++)
80 {
81 const uint64_t *input_base_y = input_base + w;
82#if defined(__SSE2__)
83 __m128i final;
84 __m128i res = _mm_setzero_si128();
85
86 for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
87 input_base_y += (ctx->scaled.stride >> 2))
88 {
89 __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
90 __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
91
92 res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
93 }
94
95 for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
96 {
97 __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
98 __m128i col = _mm_set_epi64x(0, input_base_y[0]);
99
100 res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
101 }
102
103 res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
104 res = _mm_srai_epi16(res, (7 - 2 - 2));
105
106 final = _mm_packus_epi16(res, res);
107
108 output[w] = _mm_cvtsi128_si32(final);
109#else
110 int16_t res_a = 0;
111 int16_t res_r = 0;
112 int16_t res_g = 0;
113 int16_t res_b = 0;
114
115 for (y = 0; y < ctx->vert.filter_len; y++,
116 input_base_y += (ctx->scaled.stride >> 3))
117 {
118 uint64_t col = *input_base_y;
119
120 int16_t a = (col >> 48) & 0xffff;
121 int16_t r = (col >> 32) & 0xffff;
122 int16_t g = (col >> 16) & 0xffff;
123 int16_t b = (col >> 0) & 0xffff;
124
125 int16_t coeff = filter_vert[y];
126
127 res_a += (a * coeff) >> 16;
128 res_r += (r * coeff) >> 16;
129 res_g += (g * coeff) >> 16;
130 res_b += (b * coeff) >> 16;
131 }
132
133 res_a >>= (7 - 2 - 2);
134 res_r >>= (7 - 2 - 2);
135 res_g >>= (7 - 2 - 2);
136 res_b >>= (7 - 2 - 2);
137
138 output[w] =
139 (clamp_8bit(res_a) << 24) |
140 (clamp_8bit(res_r) << 16) |
141 (clamp_8bit(res_g) << 8) |
142 (clamp_8bit(res_b) << 0);
143#endif
144 }
145 }
146}
147
148void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
149{
150 int h, w, x;
151 const uint32_t *input = (uint32_t*)input_;
152 uint64_t *output = ctx->scaled.frame;
153
154 for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
155 output += ctx->scaled.stride >> 3)
156 {
157 const int16_t *filter_horiz = ctx->horiz.filter;
158
159 for (w = 0; w < ctx->scaled.width; w++,
160 filter_horiz += ctx->horiz.filter_stride)
161 {
162 const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
163#if defined(__SSE2__)
164 __m128i res = _mm_setzero_si128();
165#ifndef __x86_64__
166 union
167 {
168 uint32_t *u32;
169 uint64_t *u64;
170 } u;
171#endif
172 for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2)
173 {
174 __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);
175
176 __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
177 ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());
178
179 col = _mm_slli_epi16(col, 7);
180 res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
181 }
182
183 for (; x < ctx->horiz.filter_len; x++)
184 {
185 __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
186 __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());
187
188 col = _mm_slli_epi16(col, 7);
189 res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
190 }
191
192 res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
193
194#ifdef __x86_64__
195 output[w] = _mm_cvtsi128_si64(res);
196#else /* 32-bit doesn't have si64. Do it in two steps. */
197 u.u64 = output + w;
198 u.u32[0] = _mm_cvtsi128_si32(res);
199 u.u32[1] = _mm_cvtsi128_si32(_mm_srli_si128(res, 4));
200#endif
201#else
202 int16_t res_a = 0;
203 int16_t res_r = 0;
204 int16_t res_g = 0;
205 int16_t res_b = 0;
206
207 for (x = 0; x < ctx->horiz.filter_len; x++)
208 {
209 uint32_t col = input_base_x[x];
210
211 int16_t a = (col >> (24 - 7)) & (0xff << 7);
212 int16_t r = (col >> (16 - 7)) & (0xff << 7);
213 int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
214 int16_t b = (col << ( 0 + 7)) & (0xff << 7);
215
216 int16_t coeff = filter_horiz[x];
217
218 res_a += (a * coeff) >> 16;
219 res_r += (r * coeff) >> 16;
220 res_g += (g * coeff) >> 16;
221 res_b += (b * coeff) >> 16;
222 }
223
224 output[w] = (
225 (uint64_t)res_a << 48) |
226 ((uint64_t)res_r << 32) |
227 ((uint64_t)res_g << 16) |
228 ((uint64_t)res_b << 0);
229#endif
230 }
231 }
232}
233
/* Nearest-neighbour (point) resize of a 32-bit-per-pixel image.
 *
 * Source coordinates are tracked in 16.16 fixed point held in plain ints.
 * The step is computed as (1 << 16) * in_dim / out_dim, so with a 32-bit
 * int the input dimensions have to stay below 32768 for that product not
 * to overflow.
 *
 * ctx        : scaler context; not used by this routine.
 * output_    : destination buffer, written as uint32_t pixels.
 * input_     : source buffer, read as uint32_t pixels.
 * out_width  : destination width in pixels.
 * out_height : destination height in pixels.
 * in_width   : source width in pixels.
 * in_height  : source height in pixels.
 * out_stride : destination pitch; rows are (out_stride >> 2) pixels apart.
 * in_stride  : source pitch; rows are (in_stride >> 2) pixels apart.
 *
 * Returns without touching output_ when out_width or out_height is not
 * positive.
 */
void scaler_argb8888_point_special(const struct scaler_ctx *ctx,
      void *output_, const void *input_,
      int out_width, int out_height,
      int in_width, int in_height,
      int out_stride, int in_stride)
{
   int h, w;
   int x_pos, x_step;
   int y_pos, y_step;
   const uint32_t *input = (const uint32_t*)input_;
   uint32_t *output      = (uint32_t*)output_;

   (void)ctx;

   /* An empty destination has nothing to fill, and the divisions below
    * would otherwise divide by zero. */
   if (out_width <= 0 || out_height <= 0)
      return;

   /* Distance between consecutive samples, and the position of the first
    * one: half a step minus half a pixel. */
   x_step = (1 << 16) * in_width  / out_width;
   y_step = (1 << 16) * in_height / out_height;
   x_pos  = (1 << 15) * in_width  / out_width  - (1 << 15);
   y_pos  = (1 << 15) * in_height / out_height - (1 << 15);

   /* When upscaling the start offset is negative; clamp it to the first
    * source pixel. */
   if (x_pos < 0)
      x_pos = 0;
   if (y_pos < 0)
      y_pos = 0;

   for (h = 0; h < out_height; h++, y_pos += y_step, output += out_stride >> 2)
   {
      int x               = x_pos;
      const uint32_t *inp = input + (y_pos >> 16) * (in_stride >> 2);

      for (w = 0; w < out_width; w++, x += x_step)
         output[w] = inp[x >> 16];
   }
}