| 1 | /* |
| 2 | * SDL - Simple DirectMedia Layer |
| 3 | * CELL BE Support for PS3 Framebuffer |
| 4 | * Copyright (C) 2008, 2009 International Business Machines Corporation |
| 5 | * |
| 6 | * This library is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU Lesser General Public License as published |
| 8 | * by the Free Software Foundation; either version 2.1 of the License, or |
| 9 | * (at your option) any later version. |
| 10 | * |
| 11 | * This library is distributed in the hope that it will be useful, but |
| 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | * Lesser General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU Lesser General Public |
| 17 | * License along with this library; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 |
| 19 | * USA |
| 20 | * |
| 21 | * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com> |
| 22 | * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> |
| 23 | * SPE code based on research by: |
| 24 | * Rene Becker |
| 25 | * Thimo Emmerich |
| 26 | */ |
| 27 | |
| 28 | #include "spu_common.h" |
| 29 | |
| 30 | #include <spu_intrinsics.h> |
| 31 | #include <spu_mfcio.h> |
| 32 | |
// Debugging
//#define DEBUG

/*
 * deprintf(fmt, ...)
 *
 * printf-style debug trace. With DEBUG defined the message is written to
 * stdout and flushed immediately; otherwise the call compiles to nothing
 * and its arguments are not evaluated.
 *
 * Both variants expand to a single statement (do { } while (0)) so the
 * macro stays safe inside unbraced if/else bodies.
 */
#ifdef DEBUG
#include <stdio.h>
#define deprintf(fmt, ...) \
    do { \
        fprintf(stdout, fmt, ##__VA_ARGS__); \
        fflush(stdout); \
    } while (0)
#else
#define deprintf(fmt, ...) do { } while (0)
#endif
| 43 | |
| 44 | struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128))); |
| 45 | |
| 46 | /* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored |
| 47 | * there might be the need to retrieve misaligned data, adjust |
| 48 | * incoming v and u plane to be able to handle this (add 128) |
| 49 | */ |
| 50 | unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128))); |
| 51 | unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128))); |
| 52 | unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128))); |
| 53 | |
| 54 | /* A maximum of 4 lines BGRA are stored, 4 byte per pixel */ |
| 55 | unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128))); |
| 56 | |
| 57 | /* some vectors needed by the float to int conversion */ |
| 58 | static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; |
| 59 | static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; |
| 60 | |
/* Frame converters: take no arguments and read the job description from
 * the global parms_converter. Declared with (void) so that they are real
 * prototypes rather than old-style unchecked declarations.
 */
void yuv_to_rgb_w16(void);
void yuv_to_rgb_w32(void);

/* Two-line conversion kernels working on local-store buffers. Parameter
 * names match the definitions further down.
 */
void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
| 66 | |
| 67 | |
| 68 | int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused))) |
| 69 | { |
| 70 | deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id); |
| 71 | uint32_t ea_mfc, mbox; |
| 72 | // send ready message |
| 73 | spu_write_out_mbox(SPU_READY); |
| 74 | |
| 75 | while (1) { |
| 76 | /* Check mailbox */ |
| 77 | mbox = spu_read_in_mbox(); |
| 78 | deprintf("[SPU] Message is %u\n", mbox); |
| 79 | switch (mbox) { |
| 80 | case SPU_EXIT: |
| 81 | deprintf("[SPU] fb_writer goes down...\n"); |
| 82 | return 0; |
| 83 | case SPU_START: |
| 84 | break; |
| 85 | default: |
| 86 | deprintf("[SPU] Cannot handle message\n"); |
| 87 | continue; |
| 88 | } |
| 89 | |
| 90 | /* Tag Manager setup */ |
| 91 | unsigned int tag_id; |
| 92 | tag_id = mfc_multi_tag_reserve(1); |
| 93 | if (tag_id == MFC_TAG_INVALID) { |
| 94 | deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n"); |
| 95 | return 0; |
| 96 | } |
| 97 | |
| 98 | /* DMA transfer for the input parameters */ |
| 99 | ea_mfc = spu_read_in_mbox(); |
| 100 | deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc); |
| 101 | spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD); |
| 102 | DMA_WAIT_TAG(tag_id); |
| 103 | |
| 104 | /* There are alignment issues that involve handling of special cases |
| 105 | * a width of 32 results in a width of 16 in the chrominance |
| 106 | * --> choose the proper handling to optimize the performance |
| 107 | */ |
| 108 | deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height); |
| 109 | if (parms_converter.src_pixel_width & 0x1f) { |
| 110 | deprintf("[SPU] Using yuv_to_rgb_w16\n"); |
| 111 | yuv_to_rgb_w16(); |
| 112 | } else { |
| 113 | deprintf("[SPU] Using yuv_to_rgb_w32\n"); |
| 114 | yuv_to_rgb_w32(); |
| 115 | } |
| 116 | |
| 117 | mfc_multi_tag_release(tag_id, 1); |
| 118 | deprintf("[SPU] yuv2rgb_spu... done!\n"); |
| 119 | /* Send FIN message */ |
| 120 | spu_write_out_mbox(SPU_FIN); |
| 121 | } |
| 122 | |
| 123 | return 0; |
| 124 | } |
| 125 | |
| 126 | |
| 127 | /* |
| 128 | * float_to_char() |
| 129 | * |
| 130 | * converts a float to a character using saturated |
| 131 | * arithmetic |
| 132 | * |
| 133 | * @param s float for conversion |
| 134 | * @returns converted character |
| 135 | */ |
| 136 | inline static unsigned char float_to_char(float s) { |
| 137 | vector float vec_s = spu_splats(s); |
| 138 | vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); |
| 139 | vec_s = spu_sel(vec_s, vec_0_1, select_1); |
| 140 | |
| 141 | vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); |
| 142 | vec_s = spu_sel(vec_s, vec_255, select_2); |
| 143 | return (unsigned char) spu_extract(vec_s,0); |
| 144 | } |
| 145 | |
| 146 | |
| 147 | /* |
| 148 | * vfloat_to_vuint() |
| 149 | * |
| 150 | * converts a float vector to an unsinged int vector using saturated |
| 151 | * arithmetic |
| 152 | * |
| 153 | * @param vec_s float vector for conversion |
| 154 | * @returns converted unsigned int vector |
| 155 | */ |
| 156 | inline static vector unsigned int vfloat_to_vuint(vector float vec_s) { |
| 157 | vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); |
| 158 | vec_s = spu_sel(vec_s, vec_0_1, select_1); |
| 159 | |
| 160 | vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); |
| 161 | vec_s = spu_sel(vec_s, vec_255, select_2); |
| 162 | return spu_convtu(vec_s,0); |
| 163 | } |
| 164 | |
| 165 | |
| 166 | void yuv_to_rgb_w16() { |
| 167 | // Pixel dimensions of the picture |
| 168 | uint32_t width, height; |
| 169 | |
| 170 | // Extract parameters |
| 171 | width = parms_converter.src_pixel_width; |
| 172 | height = parms_converter.src_pixel_height; |
| 173 | |
| 174 | // Plane data management |
| 175 | // Y |
| 176 | unsigned char* ram_addr_y = parms_converter.y_plane; |
| 177 | // V |
| 178 | unsigned char* ram_addr_v = parms_converter.v_plane; |
| 179 | // U |
| 180 | unsigned char* ram_addr_u = parms_converter.u_plane; |
| 181 | |
| 182 | // BGRA |
| 183 | unsigned char* ram_addr_bgra = parms_converter.dstBuffer; |
| 184 | |
| 185 | // Strides |
| 186 | unsigned int stride_y = width; |
| 187 | unsigned int stride_vu = width>>1; |
| 188 | |
| 189 | // Buffer management |
| 190 | unsigned int buf_idx = 0; |
| 191 | unsigned int size_4lines_y = stride_y<<2; |
| 192 | unsigned int size_2lines_y = stride_y<<1; |
| 193 | unsigned int size_2lines_vu = stride_vu<<1; |
| 194 | |
| 195 | // 2*width*4byte_per_pixel |
| 196 | unsigned int size_2lines_bgra = width<<3; |
| 197 | |
| 198 | |
| 199 | // start double-buffered processing |
| 200 | // 4 lines y |
| 201 | spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD); |
| 202 | |
| 203 | // 2 lines v |
| 204 | spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); |
| 205 | |
| 206 | // 2 lines u |
| 207 | spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); |
| 208 | |
| 209 | // Wait for these transfers to be completed |
| 210 | DMA_WAIT_TAG((RETR_BUF + buf_idx)); |
| 211 | |
| 212 | unsigned int i; |
| 213 | for(i=0; i<(height>>2)-1; i++) { |
| 214 | |
| 215 | buf_idx^=1; |
| 216 | |
| 217 | // 4 lines y |
| 218 | spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD); |
| 219 | |
| 220 | // 2 lines v |
| 221 | spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); |
| 222 | |
| 223 | // 2 lines u |
| 224 | spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); |
| 225 | |
| 226 | DMA_WAIT_TAG((RETR_BUF + buf_idx)); |
| 227 | |
| 228 | buf_idx^=1; |
| 229 | |
| 230 | |
| 231 | // Convert YUV to BGRA, store it back (first two lines) |
| 232 | yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); |
| 233 | |
| 234 | // Next two lines |
| 235 | yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y, |
| 236 | v_plane[buf_idx] + stride_vu, |
| 237 | u_plane[buf_idx] + stride_vu, |
| 238 | bgra + size_2lines_bgra, |
| 239 | width); |
| 240 | |
| 241 | // Wait for previous storing transfer to be completed |
| 242 | DMA_WAIT_TAG(STR_BUF); |
| 243 | |
| 244 | // Store converted lines in two steps->max transfer size 16384 |
| 245 | spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 246 | ram_addr_bgra += size_2lines_bgra; |
| 247 | spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 248 | ram_addr_bgra += size_2lines_bgra; |
| 249 | |
| 250 | // Move 4 lines |
| 251 | ram_addr_y += size_4lines_y; |
| 252 | ram_addr_v += size_2lines_vu; |
| 253 | ram_addr_u += size_2lines_vu; |
| 254 | |
| 255 | buf_idx^=1; |
| 256 | } |
| 257 | |
| 258 | // Convert YUV to BGRA, store it back (first two lines) |
| 259 | yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); |
| 260 | |
| 261 | // Next two lines |
| 262 | yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y, |
| 263 | v_plane[buf_idx] + stride_vu, |
| 264 | u_plane[buf_idx] + stride_vu, |
| 265 | bgra + size_2lines_bgra, |
| 266 | width); |
| 267 | |
| 268 | // Wait for previous storing transfer to be completed |
| 269 | DMA_WAIT_TAG(STR_BUF); |
| 270 | spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 271 | ram_addr_bgra += size_2lines_bgra; |
| 272 | spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 273 | |
| 274 | // wait for previous storing transfer to be completed |
| 275 | DMA_WAIT_TAG(STR_BUF); |
| 276 | |
| 277 | } |
| 278 | |
| 279 | |
| 280 | void yuv_to_rgb_w32() { |
| 281 | // Pixel dimensions of the picture |
| 282 | uint32_t width, height; |
| 283 | |
| 284 | // Extract parameters |
| 285 | width = parms_converter.src_pixel_width; |
| 286 | height = parms_converter.src_pixel_height; |
| 287 | |
| 288 | // Plane data management |
| 289 | // Y |
| 290 | unsigned char* ram_addr_y = parms_converter.y_plane; |
| 291 | // V |
| 292 | unsigned char* ram_addr_v = parms_converter.v_plane; |
| 293 | // U |
| 294 | unsigned char* ram_addr_u = parms_converter.u_plane; |
| 295 | |
| 296 | // BGRA |
| 297 | unsigned char* ram_addr_bgra = parms_converter.dstBuffer; |
| 298 | |
| 299 | // Strides |
| 300 | unsigned int stride_y = width; |
| 301 | unsigned int stride_vu = width>>1; |
| 302 | |
| 303 | // Buffer management |
| 304 | unsigned int buf_idx = 0; |
| 305 | unsigned int size_4lines_y = stride_y<<2; |
| 306 | unsigned int size_2lines_y = stride_y<<1; |
| 307 | unsigned int size_2lines_vu = stride_vu<<1; |
| 308 | |
| 309 | // 2*width*4byte_per_pixel |
| 310 | unsigned int size_2lines_bgra = width<<3; |
| 311 | |
| 312 | // start double-buffered processing |
| 313 | // 4 lines y |
| 314 | spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD); |
| 315 | // 2 lines v |
| 316 | spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); |
| 317 | // 2 lines u |
| 318 | spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); |
| 319 | |
| 320 | // Wait for these transfers to be completed |
| 321 | DMA_WAIT_TAG((RETR_BUF + buf_idx)); |
| 322 | |
| 323 | unsigned int i; |
| 324 | for(i=0; i < (height>>2)-1; i++) { |
| 325 | buf_idx^=1; |
| 326 | // 4 lines y |
| 327 | spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD); |
| 328 | deprintf("4lines = %d\n", size_4lines_y); |
| 329 | // 2 lines v |
| 330 | spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); |
| 331 | deprintf("2lines = %d\n", size_2lines_vu); |
| 332 | // 2 lines u |
| 333 | spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); |
| 334 | deprintf("2lines = %d\n", size_2lines_vu); |
| 335 | |
| 336 | DMA_WAIT_TAG((RETR_BUF + buf_idx)); |
| 337 | |
| 338 | buf_idx^=1; |
| 339 | |
| 340 | // Convert YUV to BGRA, store it back (first two lines) |
| 341 | yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); |
| 342 | |
| 343 | // Next two lines |
| 344 | yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y, |
| 345 | v_plane[buf_idx] + stride_vu, |
| 346 | u_plane[buf_idx] + stride_vu, |
| 347 | bgra + size_2lines_bgra, |
| 348 | width); |
| 349 | |
| 350 | // Wait for previous storing transfer to be completed |
| 351 | DMA_WAIT_TAG(STR_BUF); |
| 352 | |
| 353 | // Store converted lines in two steps->max transfer size 16384 |
| 354 | spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 355 | ram_addr_bgra += size_2lines_bgra; |
| 356 | spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 357 | ram_addr_bgra += size_2lines_bgra; |
| 358 | |
| 359 | // Move 4 lines |
| 360 | ram_addr_y += size_4lines_y; |
| 361 | ram_addr_v += size_2lines_vu; |
| 362 | ram_addr_u += size_2lines_vu; |
| 363 | |
| 364 | buf_idx^=1; |
| 365 | } |
| 366 | |
| 367 | // Convert YUV to BGRA, store it back (first two lines) |
| 368 | yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); |
| 369 | |
| 370 | // Next two lines |
| 371 | yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y, |
| 372 | v_plane[buf_idx] + stride_vu, |
| 373 | u_plane[buf_idx] + stride_vu, |
| 374 | bgra + size_2lines_bgra, |
| 375 | width); |
| 376 | |
| 377 | // Wait for previous storing transfer to be completed |
| 378 | DMA_WAIT_TAG(STR_BUF); |
| 379 | spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 380 | ram_addr_bgra += size_2lines_bgra; |
| 381 | spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); |
| 382 | |
| 383 | // Wait for previous storing transfer to be completed |
| 384 | DMA_WAIT_TAG(STR_BUF); |
| 385 | } |
| 386 | |
| 387 | |
| 388 | /* Some vectors needed by the yuv 2 rgb conversion algorithm */ |
| 389 | const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f }; |
| 390 | const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 391 | const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 }; |
| 392 | const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 }; |
| 393 | const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B }; |
| 394 | const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F }; |
| 395 | |
| 396 | const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f}; |
| 397 | const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f}; |
| 398 | const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f}; |
| 399 | const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f}; |
| 400 | |
| 401 | const vector unsigned int vec_alpha = { 255 << 24, 255 << 24, 255 << 24, 255 << 24 }; |
| 402 | |
| 403 | const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 }; |
| 404 | const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F }; |
| 405 | |
| 406 | |
/* Packs one pixel: adds the per-channel chroma offsets to the luma
 * value, saturates each channel to 0..255 and combines the channels
 * with an opaque alpha into the integer B | G<<8 | R<<16 | 255<<24.
 * All shifts are done on unsigned operands so that the alpha shift
 * cannot overflow a signed int.
 */
inline static unsigned int w16_pack_bgra(float luma, float r_offset, float g_offset, float b_offset) {
    const unsigned int r = float_to_char(luma + r_offset);
    const unsigned int g = float_to_char(luma + g_offset);
    const unsigned int b = float_to_char(luma + b_offset);

    return b | (g << 8) | (r << 16) | (255u << 24);
}


/*
 * yuv_to_rgb_w16_line()
 *
 * Scalar kernel: converts two consecutive lines of YUV input into two
 * lines of BGRA output. The second Y line is expected directly behind
 * the first one (at y_addr + width), the second output line directly
 * behind the first output line. The frame converter uses this kernel
 * for widths that are not a multiple of 32.
 *
 * @param y_addr address of the y plane in local store
 * @param v_addr address of the v plane in local store
 * @param u_addr address of the u plane in local store
 * @param bgra_addr_ address of the bgra output buffer
 * @param width the width in pixel
 */
void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
    // each pixel is stored as an integer
    unsigned int *bgra_addr = (unsigned int *) bgra_addr_;

    unsigned int x;
    for (x = 0; x < width; x += 2) {
        // Walk the line two pixels at a time: every U and V sample
        // applies to 4 pixels (two high, two wide)
        const float luma_top_left = y_addr[x];
        const float luma_top_right = y_addr[x + 1];
        const float luma_bottom_left = y_addr[x + width];
        const float luma_bottom_right = y_addr[x + width + 1];

        const float u_centered = (float) u_addr[x >> 1] - 128.0f;
        const float v_centered = (float) v_addr[x >> 1] - 128.0f;

        // Chroma contribution per channel, shared by the 2x2 pixel block
        const float r_offset = 1.403f * v_centered;
        const float g_offset = -(0.344f * u_centered + 0.714f * v_centered);
        const float b_offset = 1.773f * u_centered;

        bgra_addr[x] = w16_pack_bgra(luma_top_left, r_offset, g_offset, b_offset);
        bgra_addr[x + 1] = w16_pack_bgra(luma_top_right, r_offset, g_offset, b_offset);
        bgra_addr[x + width] = w16_pack_bgra(luma_bottom_left, r_offset, g_offset, b_offset);
        bgra_addr[x + width + 1] = w16_pack_bgra(luma_bottom_right, r_offset, g_offset, b_offset);
    }
}
| 459 | |
| 460 | |
| 461 | /* |
| 462 | * yuv_to_rgb_w32() |
| 463 | * |
| 464 | * processes to line of yuv-input, width has to be a multiple of 32 |
| 465 | * two lines of yuv are taken as input |
| 466 | * |
| 467 | * @param y_addr address of the y plane in local store |
| 468 | * @param v_addr address of the v plane in local store |
| 469 | * @param u_addr address of the u plane in local store |
| 470 | * @param bgra_addr_ address of the bgra output buffer |
| 471 | * @param width the width in pixel |
| 472 | */ |
| 473 | void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) { |
| 474 | // each pixel is stored as an integer |
| 475 | unsigned int* bgra_addr = (unsigned int*) bgra_addr_; |
| 476 | |
| 477 | unsigned int x; |
| 478 | for(x = 0; x < width; x+=32) { |
| 479 | // Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt |
| 480 | |
| 481 | const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x)); |
| 482 | const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16)); |
| 483 | const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width)); |
| 484 | const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16)); |
| 485 | const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1))); |
| 486 | const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1))); |
| 487 | |
| 488 | const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128); |
| 489 | const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128); |
| 490 | const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128); |
| 491 | const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128); |
| 492 | |
| 493 | const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128); |
| 494 | const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128); |
| 495 | const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128); |
| 496 | const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128); |
| 497 | |
| 498 | vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0); |
| 499 | vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0); |
| 500 | vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0); |
| 501 | vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0); |
| 502 | vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0); |
| 503 | vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0); |
| 504 | vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0); |
| 505 | vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0); |
| 506 | vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0); |
| 507 | vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0); |
| 508 | vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0); |
| 509 | vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0); |
| 510 | vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0); |
| 511 | vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0); |
| 512 | vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0); |
| 513 | vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0); |
| 514 | |
| 515 | const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1); |
| 516 | const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2); |
| 517 | const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3); |
| 518 | const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4); |
| 519 | |
| 520 | const vector float R1_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_upper); |
| 521 | const vector float R2_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_lower); |
| 522 | const vector float R3_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_upper); |
| 523 | const vector float R4_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_lower); |
| 524 | const vector float R5_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_upper); |
| 525 | const vector float R6_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_lower); |
| 526 | const vector float R7_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_upper); |
| 527 | const vector float R8_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_lower); |
| 528 | |
| 529 | |
| 530 | const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff)); |
| 531 | const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff)); |
| 532 | const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff)); |
| 533 | const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff)); |
| 534 | |
| 535 | const vector float G1_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_upper); |
| 536 | const vector float G2_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_lower); |
| 537 | const vector float G3_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_upper); |
| 538 | const vector float G4_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_lower); |
| 539 | const vector float G5_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_upper); |
| 540 | const vector float G6_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_lower); |
| 541 | const vector float G7_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_upper); |
| 542 | const vector float G8_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_lower); |
| 543 | |
| 544 | |
| 545 | const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1); |
| 546 | const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2); |
| 547 | const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3); |
| 548 | const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4); |
| 549 | |
| 550 | const vector float B1_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_upper); |
| 551 | const vector float B2_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_lower); |
| 552 | const vector float B3_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_upper); |
| 553 | const vector float B4_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_lower); |
| 554 | const vector float B5_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_upper); |
| 555 | const vector float B6_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_lower); |
| 556 | const vector float B7_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_upper); |
| 557 | const vector float B8_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_lower); |
| 558 | |
| 559 | |
| 560 | const vector unsigned int R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate)); |
| 561 | const vector unsigned int R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate)); |
| 562 | const vector unsigned int R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate)); |
| 563 | const vector unsigned int R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate)); |
| 564 | const vector unsigned int R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate)); |
| 565 | const vector unsigned int R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate)); |
| 566 | const vector unsigned int R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate)); |
| 567 | const vector unsigned int R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate)); |
| 568 | const vector unsigned int R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate)); |
| 569 | const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate)); |
| 570 | const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate)); |
| 571 | const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate)); |
| 572 | const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate)); |
| 573 | const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate)); |
| 574 | const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate)); |
| 575 | const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate)); |
| 576 | |
| 577 | const vector unsigned int G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate)); |
| 578 | const vector unsigned int G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate)); |
| 579 | const vector unsigned int G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate)); |
| 580 | const vector unsigned int G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate)); |
| 581 | const vector unsigned int G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate)); |
| 582 | const vector unsigned int G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate)); |
| 583 | const vector unsigned int G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate)); |
| 584 | const vector unsigned int G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate)); |
| 585 | const vector unsigned int G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate)); |
| 586 | const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate)); |
| 587 | const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate)); |
| 588 | const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate)); |
| 589 | const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate)); |
| 590 | const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate)); |
| 591 | const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate)); |
| 592 | const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate)); |
| 593 | |
| 594 | const vector unsigned int B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate)); |
| 595 | const vector unsigned int B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate)); |
| 596 | const vector unsigned int B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate)); |
| 597 | const vector unsigned int B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate)); |
| 598 | const vector unsigned int B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate)); |
| 599 | const vector unsigned int B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate)); |
| 600 | const vector unsigned int B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate)); |
| 601 | const vector unsigned int B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate)); |
| 602 | const vector unsigned int B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate)); |
| 603 | const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate)); |
| 604 | const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate)); |
| 605 | const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate)); |
| 606 | const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate)); |
| 607 | const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate)); |
| 608 | const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate)); |
| 609 | const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate)); |
| 610 | |
| 611 | *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha, B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1))); |
| 612 | *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha, B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1))); |
| 613 | *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha, B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1))); |
| 614 | *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha, B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1))); |
| 615 | *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha, B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1))); |
| 616 | *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha, B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1))); |
| 617 | *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha, B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1))); |
| 618 | *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha, B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1))); |
| 619 | *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha, B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1))); |
| 620 | *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1))); |
| 621 | *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1))); |
| 622 | *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1))); |
| 623 | *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1))); |
| 624 | *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1))); |
| 625 | *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1))); |
| 626 | *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1))); |
| 627 | } |
| 628 | } |
| 629 | |