src/video/ps3/spulibs/yuv2rgb_converter.c

   1 /*
   2  * SDL - Simple DirectMedia Layer
   3  * CELL BE Support for PS3 Framebuffer
   4  * Copyright (C) 2008, 2009 International Business Machines Corporation
   5  *
   6  * This library is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU Lesser General Public License as published
   8  * by the Free Software Foundation; either version 2.1 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  19  * USA
  20  *
  21  *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
  22  *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
  23  *  SPE code based on research by:
  24  *  Rene Becker
  25  *  Thimo Emmerich
  26  */
  27
  28 #include "spu_common.h"
  29
  30 #include <spu_intrinsics.h>
  31 #include <spu_mfcio.h>
  32
  33 // Debugging
  34 //#define DEBUG
  35
  36 #ifdef DEBUG
  37 #define deprintf(fmt, args... ) \
  38         fprintf( stdout, fmt, ##args ); \
  39         fflush( stdout );
  40 #else
  41 #define deprintf( fmt, args... )
  42 #endif
  43
  44 struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));
  45
  46 /* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
  47  * there might be the need to retrieve misaligned data, adjust
  48  * incoming v and u plane to be able to handle this (add 128)
  49  */
  50 unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
  51 unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
  52 unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
  53
  54 /* A maximum of 4 lines BGRA are stored, 4 byte per pixel */
  55 unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
  56
  57 /* some vectors needed by the float to int conversion */
  58 static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
  59 static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
  60
  61 void yuv_to_rgb_w16();
  62 void yuv_to_rgb_w32();
  63
  64 void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
  65 void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
  66
  67
  68 int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
  69 {
  70         deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
  71         uint32_t ea_mfc, mbox;
  72         // send ready message
  73         spu_write_out_mbox(SPU_READY);
  74
  75         while (1) {
  76                 /* Check mailbox */
  77                 mbox = spu_read_in_mbox();
  78                 deprintf("[SPU] Message is %u\n", mbox);
  79                 switch (mbox) {
  80                         case SPU_EXIT:
  81                                 deprintf("[SPU] fb_writer goes down...\n");
  82                                 return 0;
  83                         case SPU_START:
  84                                 break;
  85                         default:
  86                                 deprintf("[SPU] Cannot handle message\n");
  87                                 continue;
  88                 }
  89
  90                 /* Tag Manager setup */
  91                 unsigned int tag_id;
  92                 tag_id = mfc_multi_tag_reserve(1);
  93                 if (tag_id == MFC_TAG_INVALID) {
  94                         deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
  95                         return 0;
  96                 }
  97
  98                 /* DMA transfer for the input parameters */
  99                 ea_mfc = spu_read_in_mbox();
 100                 deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
 101                 spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
 102                 DMA_WAIT_TAG(tag_id);
 103
 104                 /* There are alignment issues that involve handling of special cases
 105                  * a width of 32 results in a width of 16 in the chrominance
 106                  * --> choose the proper handling to optimize the performance
 107                  */
 108                 deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
 109                 if (parms_converter.src_pixel_width & 0x1f) {
 110                         deprintf("[SPU] Using yuv_to_rgb_w16\n");
 111                         yuv_to_rgb_w16();
 112                 } else {
 113                         deprintf("[SPU] Using yuv_to_rgb_w32\n");
 114                         yuv_to_rgb_w32();
 115                 }
 116
 117                 mfc_multi_tag_release(tag_id, 1);
 118                 deprintf("[SPU] yuv2rgb_spu... done!\n");
 119                 /* Send FIN message */
 120                 spu_write_out_mbox(SPU_FIN);
 121         }
 122
 123         return 0;
 124 }
 125
 126
 127 /*
 128  * float_to_char()
 129  *
 130  * converts a float to a character using saturated
 131  * arithmetic
 132  *
 133  * @param s float for conversion
 134  * @returns converted character
 135  */
 136 inline static unsigned char float_to_char(float s) {
 137         vector float vec_s = spu_splats(s);
 138         vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
 139         vec_s = spu_sel(vec_s, vec_0_1, select_1);
 140
 141         vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
 142         vec_s = spu_sel(vec_s, vec_255, select_2);
 143         return (unsigned char) spu_extract(vec_s,0);
 144 }
 145
 146
 147 /*
 148  * vfloat_to_vuint()
 149  *
 150  * converts a float vector to an unsinged int vector using saturated
 151  * arithmetic
 152  *
 153  * @param vec_s float vector for conversion
 154  * @returns converted unsigned int vector
 155  */
 156 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
 157         vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
 158         vec_s = spu_sel(vec_s, vec_0_1, select_1);
 159
 160         vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
 161         vec_s = spu_sel(vec_s, vec_255, select_2);
 162         return spu_convtu(vec_s,0);
 163 }
 164
 165
 166 void yuv_to_rgb_w16() {
 167         // Pixel dimensions of the picture
 168         uint32_t width, height;
 169
 170         // Extract parameters
 171         width = parms_converter.src_pixel_width;
 172         height = parms_converter.src_pixel_height;
 173
 174         // Plane data management
 175         // Y
 176         unsigned char* ram_addr_y = parms_converter.y_plane;
 177         // V
 178         unsigned char* ram_addr_v = parms_converter.v_plane;
 179         // U
 180         unsigned char* ram_addr_u = parms_converter.u_plane;
 181
 182         // BGRA
 183         unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
 184
 185         // Strides
 186         unsigned int stride_y = width;
 187         unsigned int stride_vu = width>>1;
 188
 189         // Buffer management
 190         unsigned int buf_idx = 0;
 191         unsigned int size_4lines_y = stride_y<<2;
 192         unsigned int size_2lines_y = stride_y<<1;
 193         unsigned int size_2lines_vu = stride_vu<<1;
 194
 195         // 2*width*4byte_per_pixel
 196         unsigned int size_2lines_bgra = width<<3;
 197
 198
 199         // start double-buffered processing
 200         // 4 lines y
 201         spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
 202
 203         // 2 lines v
 204         spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
 205
 206         // 2 lines u
 207         spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
 208
 209         // Wait for these transfers to be completed
 210         DMA_WAIT_TAG((RETR_BUF + buf_idx));
 211
 212         unsigned int i;
 213         for(i=0; i<(height>>2)-1; i++) {
 214
 215                 buf_idx^=1;
 216
 217                 // 4 lines y
 218                 spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
 219
 220                 // 2 lines v
 221                 spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
 222
 223                 // 2 lines u
 224                 spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
 225
 226                 DMA_WAIT_TAG((RETR_BUF + buf_idx));
 227
 228                 buf_idx^=1;
 229
 230
 231                 // Convert YUV to BGRA, store it back (first two lines)
 232                 yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
 233
 234                 // Next two lines
 235                 yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
 236                                 v_plane[buf_idx] + stride_vu,
 237                                 u_plane[buf_idx] + stride_vu,
 238                                 bgra + size_2lines_bgra,
 239                                 width);
 240
 241                 // Wait for previous storing transfer to be completed
 242                 DMA_WAIT_TAG(STR_BUF);
 243
 244                 // Store converted lines in two steps->max transfer size 16384
 245                 spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 246                 ram_addr_bgra += size_2lines_bgra;
 247                 spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 248                 ram_addr_bgra += size_2lines_bgra;
 249
 250                 // Move 4 lines
 251                 ram_addr_y += size_4lines_y;
 252                 ram_addr_v += size_2lines_vu;
 253                 ram_addr_u += size_2lines_vu;
 254
 255                 buf_idx^=1;
 256         }
 257
 258         // Convert YUV to BGRA, store it back (first two lines)
 259         yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
 260
 261         // Next two lines
 262         yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
 263                         v_plane[buf_idx] + stride_vu,
 264                         u_plane[buf_idx] + stride_vu,
 265                         bgra + size_2lines_bgra,
 266                         width);
 267
 268         // Wait for previous storing transfer to be completed
 269         DMA_WAIT_TAG(STR_BUF);
 270         spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 271         ram_addr_bgra += size_2lines_bgra;
 272         spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 273
 274         // wait for previous storing transfer to be completed
 275         DMA_WAIT_TAG(STR_BUF);
 276
 277 }
 278
 279
 280 void yuv_to_rgb_w32() {
 281         // Pixel dimensions of the picture
 282         uint32_t width, height;
 283
 284         // Extract parameters
 285         width = parms_converter.src_pixel_width;
 286         height = parms_converter.src_pixel_height;
 287
 288         // Plane data management
 289         // Y
 290         unsigned char* ram_addr_y = parms_converter.y_plane;
 291         // V
 292         unsigned char* ram_addr_v = parms_converter.v_plane;
 293         // U
 294         unsigned char* ram_addr_u = parms_converter.u_plane;
 295
 296         // BGRA
 297         unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
 298
 299         // Strides
 300         unsigned int stride_y = width;
 301         unsigned int stride_vu = width>>1;
 302
 303         // Buffer management
 304         unsigned int buf_idx = 0;
 305         unsigned int size_4lines_y = stride_y<<2;
 306         unsigned int size_2lines_y = stride_y<<1;
 307         unsigned int size_2lines_vu = stride_vu<<1;
 308
 309         // 2*width*4byte_per_pixel
 310         unsigned int size_2lines_bgra = width<<3;
 311
 312         // start double-buffered processing
 313         // 4 lines y
 314         spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
 315         // 2 lines v
 316         spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
 317         // 2 lines u
 318         spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
 319
 320         // Wait for these transfers to be completed
 321         DMA_WAIT_TAG((RETR_BUF + buf_idx));
 322
 323         unsigned int i;
 324         for(i=0; i < (height>>2)-1; i++) {
 325                 buf_idx^=1;
 326                 // 4 lines y
 327                 spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
 328                 deprintf("4lines = %d\n", size_4lines_y);
 329                 // 2 lines v
 330                 spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
 331                 deprintf("2lines = %d\n", size_2lines_vu);
 332                 // 2 lines u
 333                 spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
 334                 deprintf("2lines = %d\n", size_2lines_vu);
 335
 336                 DMA_WAIT_TAG((RETR_BUF + buf_idx));
 337
 338                 buf_idx^=1;
 339
 340                 // Convert YUV to BGRA, store it back (first two lines)
 341                 yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
 342
 343                 // Next two lines
 344                 yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
 345                                 v_plane[buf_idx] + stride_vu,
 346                                 u_plane[buf_idx] + stride_vu,
 347                                 bgra + size_2lines_bgra,
 348                                 width);
 349
 350                 // Wait for previous storing transfer to be completed
 351                 DMA_WAIT_TAG(STR_BUF);
 352
 353                 // Store converted lines in two steps->max transfer size 16384
 354                 spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 355                 ram_addr_bgra += size_2lines_bgra;
 356                 spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 357                 ram_addr_bgra += size_2lines_bgra;
 358
 359                 // Move 4 lines
 360                 ram_addr_y += size_4lines_y;
 361                 ram_addr_v += size_2lines_vu;
 362                 ram_addr_u += size_2lines_vu;
 363
 364                 buf_idx^=1;
 365         }
 366
 367         // Convert YUV to BGRA, store it back (first two lines)
 368         yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
 369
 370         // Next two lines
 371         yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
 372                         v_plane[buf_idx] + stride_vu,
 373                         u_plane[buf_idx] + stride_vu,
 374                         bgra + size_2lines_bgra,
 375                         width);
 376
 377         // Wait for previous storing transfer to be completed
 378         DMA_WAIT_TAG(STR_BUF);
 379         spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 380         ram_addr_bgra += size_2lines_bgra;
 381         spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
 382
 383         // Wait for previous storing transfer to be completed
 384         DMA_WAIT_TAG(STR_BUF);
 385 }
 386
 387
 388 /* Some vectors needed by the yuv 2 rgb conversion algorithm */
 389 const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
 390 const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 391 const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
 392 const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
 393 const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
 394 const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
 395
 396 const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
 397 const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
 398 const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
 399 const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
 400
 401 const vector unsigned int vec_alpha =  { 255 << 24, 255 << 24, 255 << 24, 255 << 24 };
 402
 403 const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
 404 const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
 405
 406
 407 /*
 408  * yuv_to_rgb_w16()
 409  *
 410  * processes to line of yuv-input, width has to be a multiple of 16
 411  * two lines of yuv are taken as input
 412  *
 413  * @param y_addr address of the y plane in local store
 414  * @param v_addr address of the v plane in local store
 415  * @param u_addr address of the u plane in local store
 416  * @param bgra_addr_ address of the bgra output buffer
 417  * @param width the width in pixel
 418  */
 419 void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
 420         // each pixel is stored as an integer
 421         unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
 422
 423         unsigned int x;
 424         for(x = 0; x < width; x+=2) {
 425                 // Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
 426                 const unsigned char Y_1 = *(y_addr + x);
 427                 const unsigned char Y_2 = *(y_addr + x + 1);
 428                 const unsigned char Y_3 = *(y_addr + x + width);
 429                 const unsigned char Y_4 = *(y_addr + x + width + 1);
 430                 const unsigned char U = *(u_addr + (x >> 1));
 431                 const unsigned char V = *(v_addr + (x >> 1));
 432
 433                 float V_minus_128 = (float)((float)V - 128.0f);
 434                 float U_minus_128 = (float)((float)U - 128.0f);
 435
 436                 float R_precalculate = 1.403f * V_minus_128;
 437                 float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
 438                 float B_precalculate = 1.773f * U_minus_128;
 439
 440                 const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
 441                 const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
 442                 const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
 443                 const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
 444                 const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
 445                 const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
 446                 const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
 447                 const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
 448                 const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
 449                 const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
 450                 const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
 451                 const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
 452
 453                 *(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255 << 24);
 454                 *(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255 << 24);
 455                 *(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255 << 24);
 456                 *(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255 << 24);
 457         }
 458 }
 459
 460
/*
 * yuv_to_rgb_w32_line()
 *
 * processes two lines of yuv-input, width has to be a multiple of 32
 * two lines of yuv are taken as input
 *
 * @param y_addr address of the y plane in local store
 * @param v_addr address of the v plane in local store
 * @param u_addr address of the u plane in local store
 * @param bgra_addr_ address of the bgra output buffer
 * @param width the width in pixel
 */
 473 void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
 474         // each pixel is stored as an integer
 475         unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
 476
 477         unsigned int x;
 478         for(x = 0; x < width; x+=32) {
 479                 // Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
 480
 481                 const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
 482                 const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
 483                 const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
 484                 const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
 485                 const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
 486                 const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
 487
 488                 const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
 489                 const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
 490                 const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
 491                 const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
 492
 493                 const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
 494                 const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
 495                 const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
 496                 const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
 497
 498                 vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
 499                 vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
 500                 vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
 501                 vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
 502                 vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
 503                 vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
 504                 vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
 505                 vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
 506                 vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
 507                 vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
 508                 vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
 509                 vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
 510                 vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
 511                 vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
 512                 vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
 513                 vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
 514
 515                 const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
 516                 const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
 517                 const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
 518                 const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
 519
 520                 const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
 521                 const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
 522                 const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
 523                 const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
 524                 const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
 525                 const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
 526                 const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
 527                 const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
 528
 529
 530                 const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
 531                 const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
 532                 const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
 533                 const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
 534
 535                 const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
 536                 const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
 537                 const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
 538                 const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
 539                 const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
 540                 const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
 541                 const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
 542                 const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
 543
 544
 545                 const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
 546                 const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
 547                 const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
 548                 const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
 549
 550                 const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
 551                 const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
 552                 const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
 553                 const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
 554                 const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
 555                 const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
 556                 const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
 557                 const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
 558
 559
 560                 const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
 561                 const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
 562                 const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
 563                 const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
 564                 const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
 565                 const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
 566                 const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
 567                 const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
 568                 const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
 569                 const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
 570                 const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
 571                 const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
 572                 const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
 573                 const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
 574                 const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
 575                 const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
 576
 577                 const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
 578                 const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
 579                 const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
 580                 const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
 581                 const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
 582                 const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
 583                 const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
 584                 const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
 585                 const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
 586                 const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
 587                 const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
 588                 const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
 589                 const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
 590                 const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
 591                 const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
 592                 const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
 593
 594                 const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
 595                 const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
 596                 const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
 597                 const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
 598                 const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
 599                 const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
 600                 const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
 601                 const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
 602                 const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
 603                 const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
 604                 const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
 605                 const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
 606                 const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
 607                 const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
 608                 const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
 609                 const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
 610
 611                 *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
 612                 *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
 613                 *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
 614                 *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
 615                 *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
 616                 *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
 617                 *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
 618                 *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
 619                 *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
 620                 *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
 621                 *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
 622                 *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
 623                 *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
 624                 *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
 625                 *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
 626                 *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
 627         }
 628 }
 629