| 1 | /* Copyright (C) 2010-2020 The RetroArch team |
| 2 | * |
| 3 | * --------------------------------------------------------------------------------------- |
| 4 | * The following license statement only applies to this file (scaler_int.c). |
| 5 | * --------------------------------------------------------------------------------------- |
| 6 | * |
| 7 | * Permission is hereby granted, free of charge, |
| 8 | * to any person obtaining a copy of this software and associated documentation files (the "Software"), |
| 9 | * to deal in the Software without restriction, including without limitation the rights to |
| 10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, |
| 11 | * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
| 16 | * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 19 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 21 | */ |
| 22 | |
| 23 | #include <gfx/scaler/scaler_int.h> |
| 24 | |
| 25 | #include <retro_inline.h> |
| 26 | |
| 27 | #ifdef SCALER_NO_SIMD |
| 28 | #undef __SSE2__ |
| 29 | #endif |
| 30 | |
| 31 | #if defined(__SSE2__) |
| 32 | #include <emmintrin.h> |
| 33 | #ifdef _WIN32 |
| 34 | #include <intrin.h> |
| 35 | #endif |
| 36 | #endif |
| 37 | |
| 38 | /* ARGB8888 scaler is split in two: |
| 39 | * |
| 40 | * First, horizontal scaler is applied. |
 * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted
 * left by 7 so they occupy 15 bits.
| 43 | * |
| 44 | * The sign bit is kept empty as we have to do signed multiplication for the |
| 45 | * filter. |
| 46 | * |
| 47 | * A mulhi [(a * b) >> 16] is applied which loses some precision, but is |
| 48 | * very efficient for SIMD. |
| 49 | * It is accurate enough for 8-bit purposes. |
| 50 | * |
| 51 | * The fixed point 1.0 for filter is (1 << 14). After horizontal scale, |
| 52 | * the output is kept with 16-bit channels, and will now have 13 bits |
| 53 | * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2. |
| 54 | * |
| 55 | * Vertical scaler takes the 13 bit channels, and performs the |
| 56 | * same mulhi steps. |
| 57 | * Another 2 bits of precision is lost, which ends up as 11 bits. |
| 58 | * Scaling is now complete. Channels are shifted right by 3, and saturated |
| 59 | * into 8-bit values. |
| 60 | * |
 * The C version of the scalers performs the exact same operations as the
 * SIMD code for testing purposes.
| 63 | */ |
| 64 | |
/* Vertical pass of the ARGB8888 scaler.
 *
 * Consumes the intermediate rows produced by the horizontal pass
 * (one uint64_t per pixel, four packed signed 16-bit channels A|R|G|B
 * with 13 bits of precision), convolves each output row against the
 * vertical filter in 1.14 fixed point via mulhi, then shifts down and
 * saturates each channel back to 8 bits.
 *
 * output_ receives out_width x out_height ARGB8888 pixels;
 * stride is the output pitch in bytes. */
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
{
   int h, w, y;
   const uint64_t *input = ctx->scaled.frame;
   uint32_t *output = (uint32_t*)output_;

   const int16_t *filter_vert = ctx->vert.filter;

   for (h = 0; h < ctx->out_height; h++,
         filter_vert += ctx->vert.filter_stride, output += stride >> 2)
   {
      /* First input row contributing to this output row.
       * scaled.stride is in bytes; >> 3 converts to uint64_t units. */
      const uint64_t *input_base = input + ctx->vert.filter_pos[h]
         * (ctx->scaled.stride >> 3);

      for (w = 0; w < ctx->out_width; w++)
      {
         const uint64_t *input_base_y = input_base + w;
#if defined(__SSE2__)
         __m128i final;
         __m128i res = _mm_setzero_si128();

         /* Two filter taps per iteration: the low 64-bit lane holds the
          * pixel from row y, the high lane the pixel from row y + 1,
          * each multiplied by its coefficient broadcast to all four
          * channels. Advancing by stride >> 2 uint64_t units skips the
          * two rows consumed. */
         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
               input_base_y += (ctx->scaled.stride >> 2))
         {
            __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Remaining odd tap, if filter_len is odd. */
         for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
         {
            __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(0, input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Fold the two per-row accumulators together, then shift the
          * 11-bit-precision result down by 7 - 2 - 2 = 3 to restore
          * 8-bit range (see the pipeline comment above). */
         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
         res = _mm_srai_epi16(res, (7 - 2 - 2));

         /* Saturate each signed 16-bit channel into unsigned 8 bits. */
         final = _mm_packus_epi16(res, res);

         output[w] = _mm_cvtsi128_si32(final);
#else
         int16_t res_a = 0;
         int16_t res_r = 0;
         int16_t res_g = 0;
         int16_t res_b = 0;

         for (y = 0; y < ctx->vert.filter_len; y++,
               input_base_y += (ctx->scaled.stride >> 3))
         {
            uint64_t col = *input_base_y;

            /* Unpack the four signed 16-bit channels of one pixel. */
            int16_t a = (col >> 48) & 0xffff;
            int16_t r = (col >> 32) & 0xffff;
            int16_t g = (col >> 16) & 0xffff;
            int16_t b = (col >> 0) & 0xffff;

            int16_t coeff = filter_vert[y];

            /* mulhi: keep only the high 16 bits of each product,
             * mirroring _mm_mulhi_epi16 above. */
            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

         /* Restore 8-bit range, as in the SIMD path. */
         res_a >>= (7 - 2 - 2);
         res_r >>= (7 - 2 - 2);
         res_g >>= (7 - 2 - 2);
         res_b >>= (7 - 2 - 2);

         output[w] =
            (clamp_8bit(res_a) << 24) |
            (clamp_8bit(res_r) << 16) |
            (clamp_8bit(res_g) << 8) |
            (clamp_8bit(res_b) << 0);
#endif
      }
   }
}
| 147 | |
| 148 | void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride) |
| 149 | { |
| 150 | int h, w, x; |
| 151 | const uint32_t *input = (uint32_t*)input_; |
| 152 | uint64_t *output = ctx->scaled.frame; |
| 153 | |
| 154 | for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, |
| 155 | output += ctx->scaled.stride >> 3) |
| 156 | { |
| 157 | const int16_t *filter_horiz = ctx->horiz.filter; |
| 158 | |
| 159 | for (w = 0; w < ctx->scaled.width; w++, |
| 160 | filter_horiz += ctx->horiz.filter_stride) |
| 161 | { |
| 162 | const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w]; |
| 163 | #if defined(__SSE2__) |
| 164 | __m128i res = _mm_setzero_si128(); |
| 165 | #ifndef __x86_64__ |
| 166 | union |
| 167 | { |
| 168 | uint32_t *u32; |
| 169 | uint64_t *u64; |
| 170 | } u; |
| 171 | #endif |
| 172 | for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2) |
| 173 | { |
| 174 | __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll); |
| 175 | |
| 176 | __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0, |
| 177 | ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128()); |
| 178 | |
| 179 | col = _mm_slli_epi16(col, 7); |
| 180 | res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); |
| 181 | } |
| 182 | |
| 183 | for (; x < ctx->horiz.filter_len; x++) |
| 184 | { |
| 185 | __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll); |
| 186 | __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128()); |
| 187 | |
| 188 | col = _mm_slli_epi16(col, 7); |
| 189 | res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); |
| 190 | } |
| 191 | |
| 192 | res = _mm_adds_epi16(_mm_srli_si128(res, 8), res); |
| 193 | |
| 194 | #ifdef __x86_64__ |
| 195 | output[w] = _mm_cvtsi128_si64(res); |
| 196 | #else /* 32-bit doesn't have si64. Do it in two steps. */ |
| 197 | u.u64 = output + w; |
| 198 | u.u32[0] = _mm_cvtsi128_si32(res); |
| 199 | u.u32[1] = _mm_cvtsi128_si32(_mm_srli_si128(res, 4)); |
| 200 | #endif |
| 201 | #else |
| 202 | int16_t res_a = 0; |
| 203 | int16_t res_r = 0; |
| 204 | int16_t res_g = 0; |
| 205 | int16_t res_b = 0; |
| 206 | |
| 207 | for (x = 0; x < ctx->horiz.filter_len; x++) |
| 208 | { |
| 209 | uint32_t col = input_base_x[x]; |
| 210 | |
| 211 | int16_t a = (col >> (24 - 7)) & (0xff << 7); |
| 212 | int16_t r = (col >> (16 - 7)) & (0xff << 7); |
| 213 | int16_t g = (col >> ( 8 - 7)) & (0xff << 7); |
| 214 | int16_t b = (col << ( 0 + 7)) & (0xff << 7); |
| 215 | |
| 216 | int16_t coeff = filter_horiz[x]; |
| 217 | |
| 218 | res_a += (a * coeff) >> 16; |
| 219 | res_r += (r * coeff) >> 16; |
| 220 | res_g += (g * coeff) >> 16; |
| 221 | res_b += (b * coeff) >> 16; |
| 222 | } |
| 223 | |
| 224 | output[w] = ( |
| 225 | (uint64_t)res_a << 48) | |
| 226 | ((uint64_t)res_r << 32) | |
| 227 | ((uint64_t)res_g << 16) | |
| 228 | ((uint64_t)res_b << 0); |
| 229 | #endif |
| 230 | } |
| 231 | } |
| 232 | } |
| 233 | |
/* Nearest-neighbor ("point") scaler for ARGB8888, independent of the
 * filter machinery in ctx (ctx itself is unused).
 *
 * Source coordinates are walked in 16.16 fixed point. The initial
 * offset centers the sample grid on the source image and is clamped
 * so the first sample never lands before pixel 0.
 *
 * out_stride / in_stride are pitches in bytes. */
void scaler_argb8888_point_special(const struct scaler_ctx *ctx,
      void *output_, const void *input_,
      int out_width, int out_height,
      int in_width, int in_height,
      int out_stride, int in_stride)
{
   int row, col;
   const uint32_t *src = (const uint32_t*)input_;
   uint32_t *dst = (uint32_t*)output_;
   int step_x = (1 << 16) * in_width / out_width;
   int step_y = (1 << 16) * in_height / out_height;
   int start_x = (1 << 15) * in_width / out_width - (1 << 15);
   int start_y = (1 << 15) * in_height / out_height - (1 << 15);

   (void)ctx;

   /* Clamp the centering offset when upscaling. */
   if (start_x < 0)
      start_x = 0;
   if (start_y < 0)
      start_y = 0;

   for (row = 0; row < out_height; row++)
   {
      const uint32_t *src_row = src + (start_y >> 16) * (in_stride >> 2);
      int fx = start_x;

      for (col = 0; col < out_width; col++)
      {
         dst[col] = src_row[fx >> 16];
         fx += step_x;
      }

      start_y += step_y;
      dst += out_stride >> 2;
   }
}