src/video/ps3/spulibs/bilin_scaler.c

   1 /*
   2  * SDL - Simple DirectMedia Layer
   3  * CELL BE Support for PS3 Framebuffer
   4  * Copyright (C) 2008, 2009 International Business Machines Corporation
   5  *
   6  * This library is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU Lesser General Public License as published
   8  * by the Free Software Foundation; either version 2.1 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  19  * USA
  20  *
   21  *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
  22  *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
  23  *  SPE code based on research by:
  24  *  Rene Becker
  25  *  Thimo Emmerich
  26  */
  27
  28 #include "spu_common.h"
  29
  30 #include <spu_intrinsics.h>
  31 #include <spu_mfcio.h>
  32
  33 // Debugging
  34 //#define DEBUG
  35
  36 #ifdef DEBUG
  37 #define deprintf(fmt, args... ) \
  38         fprintf( stdout, fmt, ##args ); \
  39         fflush( stdout );
  40 #else
  41 #define deprintf( fmt, args... )
  42 #endif
  43
   44 struct scale_parms_t parms __attribute__((aligned(128)));
   45 /* parms (above): scaling job description; main() fills it by DMA from the address passed in argp */

   46 /* Input lines fetched from main memory; the first index selects one of the
      * two buffers the scale functions alternate between.
   47  * A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored.
   48  * There might be the need to retrieve misaligned data, so the buffers
      * are padded (add 128) to be able to handle this.
   49  */
   50 unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
   51 unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
   52 unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
   53
   54 /* temp-buffers for the scaled output, again two of each: in total 4 lines Y,
      * therefore 2 lines V, 2 lines U (at MAX_HDTV_WIDTH) */
   55 unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
   56 unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
   57 unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
   58
   59 /* clamp bounds used by vfloat_to_vuint() for the float to int conversion */
   60 static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
   61 static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
  62
  63 void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
  64 void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
  65
  66 void scale_srcw16_dstw16();
  67 void scale_srcw16_dstw32();
  68 void scale_srcw32_dstw16();
  69 void scale_srcw32_dstw32();
  70
  71 int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
  72 {
  73         deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
  74         /* DMA transfer for the input parameters */
  75         spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
  76         DMA_WAIT_TAG(TAG_INIT);
  77
  78         deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
  79                         parms.dst_pixel_width, parms.dst_pixel_height);
  80
  81         if(parms.src_pixel_width & 0x1f) {
  82                 if(parms.dst_pixel_width & 0x1F) {
  83                         deprintf("[SPU] Using scale_srcw16_dstw16\n");
  84                         scale_srcw16_dstw16();
  85                 } else {
  86                         deprintf("[SPU] Using scale_srcw16_dstw32\n");
  87                         scale_srcw16_dstw32();
  88                 }
  89         } else {
  90                 if(parms.dst_pixel_width & 0x1F) {
  91                         deprintf("[SPU] Using scale_srcw32_dstw16\n");
  92                         scale_srcw32_dstw16();
  93                 } else {
  94                         deprintf("[SPU] Using scale_srcw32_dstw32\n");
  95                         scale_srcw32_dstw32();
  96                 }
  97         }
  98         deprintf("[SPU] bilin_scaler_spu... done!\n");
  99
 100         return 0;
 101 }
 102
 103
 104 /*
 105  * vfloat_to_vuint()
 106  *
 107  * converts a float vector to an unsinged int vector using saturated
 108  * arithmetic
 109  *
 110  * @param vec_s float vector for conversion
 111  * @returns converted unsigned int vector
 112  */
 113 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
 114         vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
 115         vec_s = spu_sel(vec_s, vec_0_1, select_1);
 116
 117         vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
 118         vec_s = spu_sel(vec_s, vec_255, select_2);
 119         return spu_convtu(vec_s,0);
 120 }
 121
 122
 123 /*
 124  * scale_srcw16_dstw16()
 125  *
 126  * processes an input image of width 16
 127  * scaling is done to a width 16
 128  * result stored in RAM
 129  */
 130 void scale_srcw16_dstw16() {
 131         // extract parameters
 132         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
 133
 134         unsigned int src_width = parms.src_pixel_width;
 135         unsigned int src_height = parms.src_pixel_height;
 136         unsigned int dst_width = parms.dst_pixel_width;
 137         unsigned int dst_height = parms.dst_pixel_height;
 138
 139         // YVU
 140         unsigned int src_linestride_y = src_width;
 141         unsigned int src_dbl_linestride_y = src_width<<1;
 142         unsigned int src_linestride_vu = src_width>>1;
 143         unsigned int src_dbl_linestride_vu = src_width;
 144
 145         // scaled YVU
 146         unsigned int scaled_src_linestride_y = dst_width;
 147
 148         // ram addresses
 149         unsigned char* src_addr_y = parms.y_plane;
 150         unsigned char* src_addr_v = parms.v_plane;
 151         unsigned char* src_addr_u = parms.u_plane;
 152
 153         // for handling misalignment, addresses are precalculated
 154         unsigned char* precalc_src_addr_v = src_addr_v;
 155         unsigned char* precalc_src_addr_u = src_addr_u;
 156
 157         unsigned int dst_picture_size = dst_width*dst_height;
 158
 159         // Sizes for destination
 160         unsigned int dst_dbl_linestride_y = dst_width<<1;
 161         unsigned int dst_dbl_linestride_vu = dst_width>>1;
 162
 163         // Perform address calculation for Y, V and U in main memory with dst_addr as base
 164         unsigned char* dst_addr_main_memory_y = dst_addr;
 165         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 166         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
 167
 168         // calculate scale factors
 169         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 170         float y_scale = (float)src_height/(float)dst_height;
 171
 172         // double buffered processing
 173         // buffer switching
 174         unsigned int curr_src_idx = 0;
 175         unsigned int curr_dst_idx = 0;
 176         unsigned int next_src_idx, next_dst_idx;
 177
 178         // 2 lines y as output, upper and lowerline
 179         unsigned int curr_interpl_y_upper = 0;
 180         unsigned int next_interpl_y_upper;
 181         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 182         // only 1 line v/u output, both planes have the same dimension
 183         unsigned int curr_interpl_vu = 0;
 184         unsigned int next_interpl_vu;
 185
 186         // weights, calculated in every loop iteration
 187         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 188         vector float vf_next_NSweight_y_upper;
 189         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 190         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 191         vector float vf_next_NSweight_vu;
 192
 193         // line indices for the src picture
 194         float curr_src_y_upper = 0.0f, next_src_y_upper;
 195         float curr_src_y_lower, next_src_y_lower;
 196         float curr_src_vu = 0.0f, next_src_vu;
 197
 198         // line indices for the dst picture
 199         unsigned int dst_y=0, dst_vu=0;
 200
 201         // offset for the v and u plane to handle misalignement
 202         unsigned int curr_lsoff_v = 0, next_lsoff_v;
 203         unsigned int curr_lsoff_u = 0, next_lsoff_u;
 204
 205         // calculate lower line indices
 206         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 207         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 208         // lower line weight
 209         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
 210
 211
 212         // start partially double buffered processing
 213         // get initial data, 2 sets of y, 1 set v, 1 set u
 214         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 215         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 216                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 217                         src_dbl_linestride_y,
 218                         RETR_BUF,
 219                         0, 0 );
 220         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 221         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 222
 223         /* iteration loop
 224          * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 225          * the scaled output is 2 lines y, 1 line v, 1 line u
 226          * the yuv2rgb-converted output is stored to RAM
 227          */
 228         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 229                 dst_y = dst_vu<<1;
 230
 231                 // calculate next indices
 232                 next_src_vu = ((float)dst_vu+1)*y_scale;
 233                 next_src_y_upper = ((float)dst_y+2)*y_scale;
 234                 next_src_y_lower = ((float)dst_y+3)*y_scale;
 235
 236                 next_interpl_vu = (unsigned int) next_src_vu;
 237                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
 238                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
 239
 240                 // calculate weight NORTH-SOUTH
 241                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 242                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 243                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
 244
 245                 // get next lines
 246                 next_src_idx = curr_src_idx^1;
 247                 next_dst_idx = curr_dst_idx^1;
 248
 249                 // 4 lines y
 250                 mfc_get( y_plane[next_src_idx],
 251                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 252                                 src_dbl_linestride_y,
 253                                 RETR_BUF+next_src_idx,
 254                                 0, 0 );
 255                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 256                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 257                                 src_dbl_linestride_y,
 258                                 RETR_BUF+next_src_idx,
 259                                 0, 0 );
 260
 261                 // 2 lines v
 262                 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
 263                 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
 264                 mfc_get( v_plane[next_src_idx],
 265                                 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
 266                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
 267                                 RETR_BUF+next_src_idx,
 268                                 0, 0 );
 269                 // 2 lines u
 270                 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
 271                 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
 272                 mfc_get( u_plane[next_src_idx],
 273                                 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
 274                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
 275                                 RETR_BUF+next_src_idx,
 276                                 0, 0 );
 277
 278                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 279
 280                 // scaling
 281                 // work line y_upper
 282                 bilinear_scale_line_w16( y_plane[curr_src_idx],
 283                                 scaled_y_plane[curr_src_idx],
 284                                 dst_width,
 285                                 vf_x_scale,
 286                                 vf_curr_NSweight_y_upper,
 287                                 src_linestride_y );
 288                 // work line y_lower
 289                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 290                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 291                                 dst_width,
 292                                 vf_x_scale,
 293                                 vf_curr_NSweight_y_lower,
 294                                 src_linestride_y );
 295                 // work line v
 296                 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 297                                 scaled_v_plane[curr_src_idx],
 298                                 dst_width>>1,
 299                                 vf_x_scale,
 300                                 vf_curr_NSweight_vu,
 301                                 src_linestride_vu );
 302                 // work line u
 303                 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 304                                 scaled_u_plane[curr_src_idx],
 305                                 dst_width>>1,
 306                                 vf_x_scale,
 307                                 vf_curr_NSweight_vu,
 308                                 src_linestride_vu );
 309
 310
 311                 // Store the result back to main memory into a destination buffer in YUV format
 312                 //---------------------------------------------------------------------------------------------
 313                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 314
 315                 // Perform three DMA transfers to 3 different locations in the main memory!
 316                 // dst_width:   Pixel width of destination image
 317                 // dst_addr:    Destination address in main memory
 318                 // dst_vu:      Counter which is incremented one by one
 319                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 320                 mfc_put(        scaled_y_plane[curr_src_idx],                                   // What from local store (addr)
 321                                 (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),   // Destination in main memory (addr)
 322                                 dst_dbl_linestride_y,                                           // Two Y lines (depending on the widht of the destination resolution)
 323                                 STR_BUF+curr_dst_idx,                                           // Tag
 324                                 0, 0 );
 325
 326                 mfc_put(        scaled_v_plane[curr_src_idx],                                   // What from local store (addr)
 327                                 (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
 328                                 dst_dbl_linestride_vu,                                          // Two V lines (depending on the widht of the destination resolution)
 329                                 STR_BUF+curr_dst_idx,                                           // Tag
 330                                 0, 0 );
 331
 332                 mfc_put(        scaled_u_plane[curr_src_idx],                                   // What from local store (addr)
 333                                 (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
 334                                 dst_dbl_linestride_vu,                                          // Two U lines (depending on the widht of the destination resolution)
 335                                 STR_BUF+curr_dst_idx,                                           // Tag
 336                                 0, 0 );
 337                 //---------------------------------------------------------------------------------------------
 338
 339
 340                 // update for next cycle
 341                 curr_src_idx = next_src_idx;
 342                 curr_dst_idx = next_dst_idx;
 343
 344                 curr_interpl_y_upper = next_interpl_y_upper;
 345                 curr_interpl_y_lower = next_interpl_y_lower;
 346                 curr_interpl_vu = next_interpl_vu;
 347
 348                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 349                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 350                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
 351
 352                 curr_src_y_upper = next_src_y_upper;
 353                 curr_src_y_lower = next_src_y_lower;
 354                 curr_src_vu = next_src_vu;
 355
 356                 curr_lsoff_v = next_lsoff_v;
 357                 curr_lsoff_u = next_lsoff_u;
 358         }
 359
 360
 361
 362         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 363
 364         // scaling
 365         // work line y_upper
 366         bilinear_scale_line_w16( y_plane[curr_src_idx],
 367                         scaled_y_plane[curr_src_idx],
 368                         dst_width,
 369                         vf_x_scale,
 370                         vf_curr_NSweight_y_upper,
 371                         src_linestride_y );
 372         // work line y_lower
 373         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 374                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 375                         dst_width,
 376                         vf_x_scale,
 377                         vf_curr_NSweight_y_lower,
 378                         src_linestride_y );
 379         // work line v
 380         bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 381                         scaled_v_plane[curr_src_idx],
 382                         dst_width>>1,
 383                         vf_x_scale,
 384                         vf_curr_NSweight_vu,
 385                         src_linestride_vu );
 386         // work line u
 387         bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 388                         scaled_u_plane[curr_src_idx],
 389                         dst_width>>1,
 390                         vf_x_scale,
 391                         vf_curr_NSweight_vu,
 392                         src_linestride_vu );
 393
 394
 395         // Store the result back to main memory into a destination buffer in YUV format
 396         //---------------------------------------------------------------------------------------------
 397         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 398
 399         // Perform three DMA transfers to 3 different locations in the main memory!
 400         // dst_width:   Pixel width of destination image
 401         // dst_addr:    Destination address in main memory
 402         // dst_vu:      Counter which is incremented one by one
 403         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 404         mfc_put(        scaled_y_plane[curr_src_idx],                                   // What from local store (addr)
 405                         (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),   // Destination in main memory (addr)
 406                         dst_dbl_linestride_y,                                           // Two Y lines (depending on the widht of the destination resolution)
 407                         STR_BUF+curr_dst_idx,                                           // Tag
 408                         0, 0 );
 409
 410         mfc_put(        scaled_v_plane[curr_src_idx],                                   // What from local store (addr)
 411                         (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
 412                         dst_dbl_linestride_vu,                                          // Two V lines (depending on the widht of the destination resolution)
 413                         STR_BUF+curr_dst_idx,                                           // Tag
 414                         0, 0 );
 415
 416         mfc_put(        scaled_u_plane[curr_src_idx],                                   // What from local store (addr)
 417                         (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
 418                         dst_dbl_linestride_vu,                                          // Two U lines (depending on the widht of the destination resolution)
 419                         STR_BUF+curr_dst_idx,                                           // Tag
 420                         0, 0 );
 421
 422         // wait for completion
 423         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 424         //---------------------------------------------------------------------------------------------
 425 }
 426
 427
 428 /*
 429  * scale_srcw16_dstw32()
 430  *
 431  * processes an input image of width 16
 432  * scaling is done to a width 32
 433  * yuv2rgb conversion on a width of 32
 434  * result stored in RAM
 435  */
 436 void scale_srcw16_dstw32() {
 437         // extract parameters
 438         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
 439
 440         unsigned int src_width = parms.src_pixel_width;
 441         unsigned int src_height = parms.src_pixel_height;
 442         unsigned int dst_width = parms.dst_pixel_width;
 443         unsigned int dst_height = parms.dst_pixel_height;
 444
 445         // YVU
 446         unsigned int src_linestride_y = src_width;
 447         unsigned int src_dbl_linestride_y = src_width<<1;
 448         unsigned int src_linestride_vu = src_width>>1;
 449         unsigned int src_dbl_linestride_vu = src_width;
 450         // scaled YVU
 451         unsigned int scaled_src_linestride_y = dst_width;
 452
 453         // ram addresses
 454         unsigned char* src_addr_y = parms.y_plane;
 455         unsigned char* src_addr_v = parms.v_plane;
 456         unsigned char* src_addr_u = parms.u_plane;
 457
 458         unsigned int dst_picture_size = dst_width*dst_height;
 459
 460         // Sizes for destination
 461         unsigned int dst_dbl_linestride_y = dst_width<<1;
 462         unsigned int dst_dbl_linestride_vu = dst_width>>1;
 463
 464         // Perform address calculation for Y, V and U in main memory with dst_addr as base
 465         unsigned char* dst_addr_main_memory_y = dst_addr;
 466         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 467         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
 468
 469
 470         // for handling misalignment, addresses are precalculated
 471         unsigned char* precalc_src_addr_v = src_addr_v;
 472         unsigned char* precalc_src_addr_u = src_addr_u;
 473
 474         // calculate scale factors
 475         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 476         float y_scale = (float)src_height/(float)dst_height;
 477
 478         // double buffered processing
 479         // buffer switching
 480         unsigned int curr_src_idx = 0;
 481         unsigned int curr_dst_idx = 0;
 482         unsigned int next_src_idx, next_dst_idx;
 483
 484         // 2 lines y as output, upper and lowerline
 485         unsigned int curr_interpl_y_upper = 0;
 486         unsigned int next_interpl_y_upper;
 487         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 488         // only 1 line v/u output, both planes have the same dimension
 489         unsigned int curr_interpl_vu = 0;
 490         unsigned int next_interpl_vu;
 491
 492         // weights, calculated in every loop iteration
 493         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 494         vector float vf_next_NSweight_y_upper;
 495         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 496         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 497         vector float vf_next_NSweight_vu;
 498
 499         // line indices for the src picture
 500         float curr_src_y_upper = 0.0f, next_src_y_upper;
 501         float curr_src_y_lower, next_src_y_lower;
 502         float curr_src_vu = 0.0f, next_src_vu;
 503
 504         // line indices for the dst picture
 505         unsigned int dst_y=0, dst_vu=0;
 506
 507         // offset for the v and u plane to handle misalignement
 508         unsigned int curr_lsoff_v = 0, next_lsoff_v;
 509         unsigned int curr_lsoff_u = 0, next_lsoff_u;
 510
 511         // calculate lower line idices
 512         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 513         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 514         // lower line weight
 515         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
 516
 517
 518         // start partially double buffered processing
 519         // get initial data, 2 sets of y, 1 set v, 1 set u
 520         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 521         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 522                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 523                         src_dbl_linestride_y,
 524                         RETR_BUF,
 525                         0, 0 );
 526         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 527         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 528
 529         // iteration loop
 530         // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 531         // the scaled output is 2 lines y, 1 line v, 1 line u
 532         // the yuv2rgb-converted output is stored to RAM
 533         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 534                 dst_y = dst_vu<<1;
 535
 536                 // calculate next indices
 537                 next_src_vu = ((float)dst_vu+1)*y_scale;
 538                 next_src_y_upper = ((float)dst_y+2)*y_scale;
 539                 next_src_y_lower = ((float)dst_y+3)*y_scale;
 540
 541                 next_interpl_vu = (unsigned int) next_src_vu;
 542                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
 543                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
 544
 545                 // calculate weight NORTH-SOUTH
 546                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 547                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 548                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
 549
 550                 // get next lines
 551                 next_src_idx = curr_src_idx^1;
 552                 next_dst_idx = curr_dst_idx^1;
 553
 554                 // 4 lines y
 555                 mfc_get( y_plane[next_src_idx],
 556                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 557                                 src_dbl_linestride_y,
 558                                 RETR_BUF+next_src_idx,
 559                                 0, 0 );
 560                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 561                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 562                                 src_dbl_linestride_y,
 563                                 RETR_BUF+next_src_idx,
 564                                 0, 0 );
 565
 566                 // 2 lines v
 567                 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
 568                 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
 569                 mfc_get( v_plane[next_src_idx],
 570                                 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
 571                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
 572                                 RETR_BUF+next_src_idx,
 573                                 0, 0 );
 574                 // 2 lines u
 575                 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
 576                 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
 577                 mfc_get( u_plane[next_src_idx],
 578                                 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
 579                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
 580                                 RETR_BUF+next_src_idx,
 581                                 0, 0 );
 582
 583                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 584
 585                 // scaling
 586                 // work line y_upper
 587                 bilinear_scale_line_w16( y_plane[curr_src_idx],
 588                                 scaled_y_plane[curr_src_idx],
 589                                 dst_width,
 590                                 vf_x_scale,
 591                                 vf_curr_NSweight_y_upper,
 592                                 src_linestride_y );
 593                 // work line y_lower
 594                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 595                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 596                                 dst_width,
 597                                 vf_x_scale,
 598                                 vf_curr_NSweight_y_lower,
 599                                 src_linestride_y );
 600                 // work line v
 601                 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 602                                 scaled_v_plane[curr_src_idx],
 603                                 dst_width>>1,
 604                                 vf_x_scale,
 605                                 vf_curr_NSweight_vu,
 606                                 src_linestride_vu );
 607                 // work line u
 608                 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 609                                 scaled_u_plane[curr_src_idx],
 610                                 dst_width>>1,
 611                                 vf_x_scale,
 612                                 vf_curr_NSweight_vu,
 613                                 src_linestride_vu );
 614
 615                 //---------------------------------------------------------------------------------------------
 616                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 617
 618                 // Perform three DMA transfers to 3 different locations in the main memory!
 619                 // dst_width:   Pixel width of destination image
 620                 // dst_addr:    Destination address in main memory
 621                 // dst_vu:      Counter which is incremented one by one
 622                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 623
 624                 mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
 625                                 (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
 626                                 dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
 627                                 STR_BUF+curr_dst_idx,                                                           // Tag
 628                                 0, 0 );
 629
 630                 mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
 631                                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 632                                 dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
 633                                 STR_BUF+curr_dst_idx,                                                           // Tag
 634                                 0, 0 );
 635
 636                 mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
 637                                 (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),        // Destination in main memory (addr)
 638                                 dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
 639                                 STR_BUF+curr_dst_idx,                                                           // Tag
 640                                 0, 0 );
 641                 //---------------------------------------------------------------------------------------------
 642
 643
 644                 // update for next cycle
 645                 curr_src_idx = next_src_idx;
 646                 curr_dst_idx = next_dst_idx;
 647
 648                 curr_interpl_y_upper = next_interpl_y_upper;
 649                 curr_interpl_y_lower = next_interpl_y_lower;
 650                 curr_interpl_vu = next_interpl_vu;
 651
 652                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 653                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 654                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
 655
 656                 curr_src_y_upper = next_src_y_upper;
 657                 curr_src_y_lower = next_src_y_lower;
 658                 curr_src_vu = next_src_vu;
 659
 660                 curr_lsoff_v = next_lsoff_v;
 661                 curr_lsoff_u = next_lsoff_u;
 662         }
 663
 664
 665
 666         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 667
 668         // scaling
 669         // work line y_upper
 670         bilinear_scale_line_w16( y_plane[curr_src_idx],
 671                         scaled_y_plane[curr_src_idx],
 672                         dst_width,
 673                         vf_x_scale,
 674                         vf_curr_NSweight_y_upper,
 675                         src_linestride_y );
 676         // work line y_lower
 677         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 678                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 679                         dst_width,
 680                         vf_x_scale,
 681                         vf_curr_NSweight_y_lower,
 682                         src_linestride_y );
 683         // work line v
 684         bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 685                         scaled_v_plane[curr_src_idx],
 686                         dst_width>>1,
 687                         vf_x_scale,
 688                         vf_curr_NSweight_vu,
 689                         src_linestride_vu );
 690         // work line u
 691         bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 692                         scaled_u_plane[curr_src_idx],
 693                         dst_width>>1,
 694                         vf_x_scale,
 695                         vf_curr_NSweight_vu,
 696                         src_linestride_vu );
 697
 698         //---------------------------------------------------------------------------------------------
 699         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 700
 701         // Perform three DMA transfers to 3 different locations in the main memory!
 702         // dst_width:   Pixel width of destination image
 703         // dst_addr:    Destination address in main memory
 704         // dst_vu:      Counter which is incremented one by one
 705         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 706
 707         mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
 708                         (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
 709                         dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
 710                         STR_BUF+curr_dst_idx,                                                           // Tag
 711                         0, 0 );
 712
 713         mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
 714                         (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 715                         dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
 716                         STR_BUF+curr_dst_idx,                                                           // Tag
 717                         0, 0 );
 718
 719         mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
 720                         (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),        // Destination in main memory (addr)
 721                         dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
 722                         STR_BUF+curr_dst_idx,                                                           // Tag
 723                         0, 0 );
 724
 725         // wait for completion
 726         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 727         //---------------------------------------------------------------------------------------------
 728 }
 729
 730
 731 /*
 732  * scale_srcw32_dstw16()
 733  *
 734  * processes an input image of width 32
 735  * scaling is done to a width 16
 736  * yuv2rgb conversion on a width of 16
 737  * result stored in RAM
 738  */
 739 void scale_srcw32_dstw16() {
 740         // extract parameters
 741         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
 742
 743         unsigned int src_width = parms.src_pixel_width;
 744         unsigned int src_height = parms.src_pixel_height;
 745         unsigned int dst_width = parms.dst_pixel_width;
 746         unsigned int dst_height = parms.dst_pixel_height;
 747
 748         // YVU
 749         unsigned int src_linestride_y = src_width;
 750         unsigned int src_dbl_linestride_y = src_width<<1;
 751         unsigned int src_linestride_vu = src_width>>1;
 752         unsigned int src_dbl_linestride_vu = src_width;
 753         // scaled YVU
 754         unsigned int scaled_src_linestride_y = dst_width;
 755
 756         // ram addresses
 757         unsigned char* src_addr_y = parms.y_plane;
 758         unsigned char* src_addr_v = parms.v_plane;
 759         unsigned char* src_addr_u = parms.u_plane;
 760
 761         unsigned int dst_picture_size = dst_width*dst_height;
 762
 763         // Sizes for destination
 764         unsigned int dst_dbl_linestride_y = dst_width<<1;
 765         unsigned int dst_dbl_linestride_vu = dst_width>>1;
 766
 767         // Perform address calculation for Y, V and U in main memory with dst_addr as base
 768         unsigned char* dst_addr_main_memory_y = dst_addr;
 769         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 770         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
 771
 772         // calculate scale factors
 773         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 774         float y_scale = (float)src_height/(float)dst_height;
 775
 776         // double buffered processing
 777         // buffer switching
 778         unsigned int curr_src_idx = 0;
 779         unsigned int curr_dst_idx = 0;
 780         unsigned int next_src_idx, next_dst_idx;
 781
 782         // 2 lines y as output, upper and lowerline
 783         unsigned int curr_interpl_y_upper = 0;
 784         unsigned int next_interpl_y_upper;
 785         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 786         // only 1 line v/u output, both planes have the same dimension
 787         unsigned int curr_interpl_vu = 0;
 788         unsigned int next_interpl_vu;
 789
 790         // weights, calculated in every loop iteration
 791         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 792         vector float vf_next_NSweight_y_upper;
 793         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 794         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 795         vector float vf_next_NSweight_vu;
 796
 797         // line indices for the src picture
 798         float curr_src_y_upper = 0.0f, next_src_y_upper;
 799         float curr_src_y_lower, next_src_y_lower;
 800         float curr_src_vu = 0.0f, next_src_vu;
 801
 802         // line indices for the dst picture
 803         unsigned int dst_y=0, dst_vu=0;
 804
 805         // calculate lower line idices
 806         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 807         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 808         // lower line weight
 809         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
 810
 811
 812         // start partially double buffered processing
 813         // get initial data, 2 sets of y, 1 set v, 1 set u
 814         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 815         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 816                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 817                         src_dbl_linestride_y,
 818                         RETR_BUF,
 819                         0, 0 );
 820         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 821         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 822
 823         // iteration loop
 824         // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 825         // the scaled output is 2 lines y, 1 line v, 1 line u
 826         // the yuv2rgb-converted output is stored to RAM
 827         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 828                 dst_y = dst_vu<<1;
 829
 830                 // calculate next indices
 831                 next_src_vu = ((float)dst_vu+1)*y_scale;
 832                 next_src_y_upper = ((float)dst_y+2)*y_scale;
 833                 next_src_y_lower = ((float)dst_y+3)*y_scale;
 834
 835                 next_interpl_vu = (unsigned int) next_src_vu;
 836                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
 837                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
 838
 839                 // calculate weight NORTH-SOUTH
 840                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 841                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 842                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
 843
 844                 // get next lines
 845                 next_src_idx = curr_src_idx^1;
 846                 next_dst_idx = curr_dst_idx^1;
 847
 848                 // 4 lines y
 849                 mfc_get( y_plane[next_src_idx],
 850                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 851                                 src_dbl_linestride_y,
 852                                 RETR_BUF+next_src_idx,
 853                                 0, 0 );
 854                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 855                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 856                                 src_dbl_linestride_y,
 857                                 RETR_BUF+next_src_idx,
 858                                 0, 0 );
 859
 860                 // 2 lines v
 861                 mfc_get( v_plane[next_src_idx],
 862                                 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
 863                                 src_dbl_linestride_vu,
 864                                 RETR_BUF+next_src_idx,
 865                                 0, 0 );
 866                 // 2 lines u
 867                 mfc_get( u_plane[next_src_idx],
 868                                 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
 869                                 src_dbl_linestride_vu,
 870                                 RETR_BUF+next_src_idx,
 871                                 0, 0 );
 872
 873                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 874
 875                 // scaling
 876                 // work line y_upper
 877                 bilinear_scale_line_w16( y_plane[curr_src_idx],
 878                                 scaled_y_plane[curr_src_idx],
 879                                 dst_width,
 880                                 vf_x_scale,
 881                                 vf_curr_NSweight_y_upper,
 882                                 src_linestride_y );
 883                 // work line y_lower
 884                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 885                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 886                                 dst_width,
 887                                 vf_x_scale,
 888                                 vf_curr_NSweight_y_lower,
 889                                 src_linestride_y );
 890                 // work line v
 891                 bilinear_scale_line_w16( v_plane[curr_src_idx],
 892                                 scaled_v_plane[curr_src_idx],
 893                                 dst_width>>1,
 894                                 vf_x_scale,
 895                                 vf_curr_NSweight_vu,
 896                                 src_linestride_vu );
 897                 // work line u
 898                 bilinear_scale_line_w16( u_plane[curr_src_idx],
 899                                 scaled_u_plane[curr_src_idx],
 900                                 dst_width>>1,
 901                                 vf_x_scale,
 902                                 vf_curr_NSweight_vu,
 903                                 src_linestride_vu );
 904
 905                 //---------------------------------------------------------------------------------------------
 906                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 907
 908                 // Perform three DMA transfers to 3 different locations in the main memory!
 909                 // dst_width:   Pixel width of destination image
 910                 // dst_addr:    Destination address in main memory
 911                 // dst_vu:      Counter which is incremented one by one
 912                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 913
 914                 mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
 915                                 (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
 916                                 dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
 917                                 STR_BUF+curr_dst_idx,                                                           // Tag
 918                                 0, 0 );
 919
 920                 mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
 921                                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 922                                 dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
 923                                 STR_BUF+curr_dst_idx,                                                           // Tag
 924                                 0, 0 );
 925
 926                 mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
 927                                 (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 928                                 dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
 929                                 STR_BUF+curr_dst_idx,                                                           // Tag
 930                                 0, 0 );
 931                 //---------------------------------------------------------------------------------------------
 932
 933
 934                 // update for next cycle
 935                 curr_src_idx = next_src_idx;
 936                 curr_dst_idx = next_dst_idx;
 937
 938                 curr_interpl_y_upper = next_interpl_y_upper;
 939                 curr_interpl_y_lower = next_interpl_y_lower;
 940                 curr_interpl_vu = next_interpl_vu;
 941
 942                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 943                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 944                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
 945
 946                 curr_src_y_upper = next_src_y_upper;
 947                 curr_src_y_lower = next_src_y_lower;
 948                 curr_src_vu = next_src_vu;
 949         }
 950
 951
 952
 953         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 954
 955         // scaling
 956         // work line y_upper
 957         bilinear_scale_line_w16( y_plane[curr_src_idx],
 958                         scaled_y_plane[curr_src_idx],
 959                         dst_width,
 960                         vf_x_scale,
 961                         vf_curr_NSweight_y_upper,
 962                         src_linestride_y );
 963         // work line y_lower
 964         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 965                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 966                         dst_width,
 967                         vf_x_scale,
 968                         vf_curr_NSweight_y_lower,
 969                         src_linestride_y );
 970         // work line v
 971         bilinear_scale_line_w16( v_plane[curr_src_idx],
 972                         scaled_v_plane[curr_src_idx],
 973                         dst_width>>1,
 974                         vf_x_scale,
 975                         vf_curr_NSweight_vu,
 976                         src_linestride_vu );
 977         // work line u
 978         bilinear_scale_line_w16( u_plane[curr_src_idx],
 979                         scaled_u_plane[curr_src_idx],
 980                         dst_width>>1,
 981                         vf_x_scale,
 982                         vf_curr_NSweight_vu,
 983                         src_linestride_vu );
 984
 985
 986         //---------------------------------------------------------------------------------------------
 987         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 988
 989         // Perform three DMA transfers to 3 different locations in the main memory!
 990         // dst_width:   Pixel width of destination image
 991         // dst_addr:    Destination address in main memory
 992         // dst_vu:      Counter which is incremented one by one
 993         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 994
 995         mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
 996                         (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
 997                         dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
 998                         STR_BUF+curr_dst_idx,                                                           // Tag
 999                         0, 0 );
1000
1001         mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
1002                         (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1003                         dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
1004                         STR_BUF+curr_dst_idx,                                                           // Tag
1005                         0, 0 );
1006
1007         mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
1008                         (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1009                         dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
1010                         STR_BUF+curr_dst_idx,                                                           // Tag
1011                         0, 0 );
1012
1013         // wait for completion
1014         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1015         //---------------------------------------------------------------------------------------------
1016 }
1017
1018
/**
 * scale_srcw32_dstw32()
 *
 * processes an input image of width 32
 * scaling is done to a width 32
 * the scaled image is stored to RAM in YUV format
 * (no yuv2rgb conversion is done in this function)
 */
1027 void scale_srcw32_dstw32() {
1028         // extract parameters
1029         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
1030
1031         unsigned int src_width = parms.src_pixel_width;
1032         unsigned int src_height = parms.src_pixel_height;
1033         unsigned int dst_width = parms.dst_pixel_width;
1034         unsigned int dst_height = parms.dst_pixel_height;
1035
1036         // YVU
1037         unsigned int src_linestride_y = src_width;
1038         unsigned int src_dbl_linestride_y = src_width<<1;
1039         unsigned int src_linestride_vu = src_width>>1;
1040         unsigned int src_dbl_linestride_vu = src_width;
1041
1042         // scaled YVU
1043         unsigned int scaled_src_linestride_y = dst_width;
1044
1045         // ram addresses
1046         unsigned char* src_addr_y = parms.y_plane;
1047         unsigned char* src_addr_v = parms.v_plane;
1048         unsigned char* src_addr_u = parms.u_plane;
1049
1050         unsigned int dst_picture_size = dst_width*dst_height;
1051
1052         // Sizes for destination
1053         unsigned int dst_dbl_linestride_y = dst_width<<1;
1054         unsigned int dst_dbl_linestride_vu = dst_width>>1;
1055
1056         // Perform address calculation for Y, V and U in main memory with dst_addr as base
1057         unsigned char* dst_addr_main_memory_y = dst_addr;
1058         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
1059         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
1060
1061         // calculate scale factors
1062         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
1063         float y_scale = (float)src_height/(float)dst_height;
1064
1065         // double buffered processing
1066         // buffer switching
1067         unsigned int curr_src_idx = 0;
1068         unsigned int curr_dst_idx = 0;
1069         unsigned int next_src_idx, next_dst_idx;
1070
1071         // 2 lines y as output, upper and lowerline
1072         unsigned int curr_interpl_y_upper = 0;
1073         unsigned int next_interpl_y_upper;
1074         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
1075         // only 1 line v/u output, both planes have the same dimension
1076         unsigned int curr_interpl_vu = 0;
1077         unsigned int next_interpl_vu;
1078
1079         // weights, calculated in every loop iteration
1080         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
1081         vector float vf_next_NSweight_y_upper;
1082         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
1083         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
1084         vector float vf_next_NSweight_vu;
1085
1086         // line indices for the src picture
1087         float curr_src_y_upper = 0.0f, next_src_y_upper;
1088         float curr_src_y_lower, next_src_y_lower;
1089         float curr_src_vu = 0.0f, next_src_vu;
1090
1091         // line indices for the dst picture
1092         unsigned int dst_y=0, dst_vu=0;
1093
1094         // calculate lower line idices
1095         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
1096         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
1097         // lower line weight
1098         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
1099
1100
1101         // start partially double buffered processing
1102         // get initial data, 2 sets of y, 1 set v, 1 set u
1103         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
1104         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
1105                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
1106                         src_dbl_linestride_y,
1107                         RETR_BUF,
1108                         0, 0 );
1109         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1110         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1111
1112         // iteration loop
1113         // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
1114         // the scaled output is 2 lines y, 1 line v, 1 line u
1115         // the yuv2rgb-converted output is stored to RAM
1116         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
1117                 dst_y = dst_vu<<1;
1118
1119                 // calculate next indices
1120                 next_src_vu = ((float)dst_vu+1)*y_scale;
1121                 next_src_y_upper = ((float)dst_y+2)*y_scale;
1122                 next_src_y_lower = ((float)dst_y+3)*y_scale;
1123
1124                 next_interpl_vu = (unsigned int) next_src_vu;
1125                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
1126                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
1127
1128                 // calculate weight NORTH-SOUTH
1129                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
1130                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
1131                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
1132
1133                 // get next lines
1134                 next_src_idx = curr_src_idx^1;
1135                 next_dst_idx = curr_dst_idx^1;
1136
1137                 // 4 lines y
1138                 mfc_get( y_plane[next_src_idx],
1139                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
1140                                 src_dbl_linestride_y,
1141                                 RETR_BUF+next_src_idx,
1142                                 0, 0 );
1143                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
1144                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
1145                                 src_dbl_linestride_y,
1146                                 RETR_BUF+next_src_idx,
1147                                 0, 0 );
1148
1149                 // 2 lines v
1150                 mfc_get( v_plane[next_src_idx],
1151                                 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
1152                                 src_dbl_linestride_vu,
1153                                 RETR_BUF+next_src_idx,
1154                                 0, 0 );
1155                 // 2 lines u
1156                 mfc_get( u_plane[next_src_idx],
1157                                 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
1158                                 src_dbl_linestride_vu,
1159                                 RETR_BUF+next_src_idx,
1160                                 0, 0 );
1161
1162                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1163
1164                 // scaling
1165                 // work line y_upper
1166                 bilinear_scale_line_w16( y_plane[curr_src_idx],
1167                                 scaled_y_plane[curr_src_idx],
1168                                 dst_width,
1169                                 vf_x_scale,
1170                                 vf_curr_NSweight_y_upper,
1171                                 src_linestride_y );
1172                 // work line y_lower
1173                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1174                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1175                                 dst_width,
1176                                 vf_x_scale,
1177                                 vf_curr_NSweight_y_lower,
1178                                 src_linestride_y );
1179                 // work line v
1180                 bilinear_scale_line_w16( v_plane[curr_src_idx],
1181                                 scaled_v_plane[curr_src_idx],
1182                                 dst_width>>1,
1183                                 vf_x_scale,
1184                                 vf_curr_NSweight_vu,
1185                                 src_linestride_vu );
1186                 // work line u
1187                 bilinear_scale_line_w16( u_plane[curr_src_idx],
1188                                 scaled_u_plane[curr_src_idx],
1189                                 dst_width>>1,
1190                                 vf_x_scale,
1191                                 vf_curr_NSweight_vu,
1192                                 src_linestride_vu );
1193
1194
1195
1196                 // Store the result back to main memory into a destination buffer in YUV format
1197                 //---------------------------------------------------------------------------------------------
1198                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1199
1200                 // Perform three DMA transfers to 3 different locations in the main memory!
1201                 // dst_width:   Pixel width of destination image
1202                 // dst_addr:    Destination address in main memory
1203                 // dst_vu:      Counter which is incremented one by one
1204                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1205
1206                 mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
1207                                 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),  // Destination in main memory (addr)
1208                                 dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
1209                                 STR_BUF+curr_dst_idx,                                                           // Tag
1210                                 0, 0 );
1211
1212                 mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
1213                                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1214                                 dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
1215                                 STR_BUF+curr_dst_idx,                                                           // Tag
1216                                 0, 0 );
1217
1218                 mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
1219                                 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1220                                 dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
1221                                 STR_BUF+curr_dst_idx,                                                           // Tag
1222                                 0, 0 );
1223                 //---------------------------------------------------------------------------------------------
1224
1225
1226                 // update for next cycle
1227                 curr_src_idx = next_src_idx;
1228                 curr_dst_idx = next_dst_idx;
1229
1230                 curr_interpl_y_upper = next_interpl_y_upper;
1231                 curr_interpl_y_lower = next_interpl_y_lower;
1232                 curr_interpl_vu = next_interpl_vu;
1233
1234                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
1235                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
1236                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
1237
1238                 curr_src_y_upper = next_src_y_upper;
1239                 curr_src_y_lower = next_src_y_lower;
1240                 curr_src_vu = next_src_vu;
1241         }
1242
1243
1244
1245         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1246
1247         // scaling
1248         // work line y_upper
1249         bilinear_scale_line_w16( y_plane[curr_src_idx],
1250                         scaled_y_plane[curr_src_idx],
1251                         dst_width,
1252                         vf_x_scale,
1253                         vf_curr_NSweight_y_upper,
1254                         src_linestride_y );
1255         // work line y_lower
1256         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1257                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1258                         dst_width,
1259                         vf_x_scale,
1260                         vf_curr_NSweight_y_lower,
1261                         src_linestride_y );
1262         // work line v
1263         bilinear_scale_line_w16( v_plane[curr_src_idx],
1264                         scaled_v_plane[curr_src_idx],
1265                         dst_width>>1,
1266                         vf_x_scale,
1267                         vf_curr_NSweight_vu,
1268                         src_linestride_vu );
1269         // work line u
1270         bilinear_scale_line_w16( u_plane[curr_src_idx],
1271                         scaled_u_plane[curr_src_idx],
1272                         dst_width>>1,
1273                         vf_x_scale,
1274                         vf_curr_NSweight_vu,
1275                         src_linestride_vu );
1276
1277
1278         // Store the result back to main memory into a destination buffer in YUV format
1279         //---------------------------------------------------------------------------------------------
1280         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1281
1282         // Perform three DMA transfers to 3 different locations in the main memory!
1283         // dst_width:   Pixel width of destination image
1284         // dst_addr:    Destination address in main memory
1285         // dst_vu:      Counter which is incremented one by one
1286         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1287
1288         mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
1289                         (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
1290                         dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
1291                         STR_BUF+curr_dst_idx,                                                           // Tag
1292                         0, 0 );
1293
1294         mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
1295                         (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1296                         dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
1297                         STR_BUF+curr_dst_idx,                                                           // Tag
1298                         0, 0 );
1299
1300         mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
1301                         (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1302                         dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
1303                         STR_BUF+curr_dst_idx,                                                           // Tag
1304                         0, 0 );
1305
1306         // wait for completion
1307         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1308         //---------------------------------------------------------------------------------------------
1309 }
1310
1311
1312 /*
1313  * bilinear_scale_line_w8()
1314  *
1315  * processes a line of yuv-input, width has to be a multiple of 8
1316  * scaled yuv-output is written to local store buffer
1317  *
1318  * @param src buffer for 2 lines input
1319  * @param dst_ buffer for 1 line output
1320  * @param dst_width the width of the destination line
1321  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1322  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1323  * @param src_linestride the stride of the srcline
1324  */
1325 void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1326
1327         unsigned char* dst = dst_;
1328
1329         unsigned int dst_x;
1330         for( dst_x=0; dst_x<dst_width; dst_x+=8) {
1331                 // address calculation for loading the 4 surrounding pixel of each calculated
1332                 // destination pixel
1333                 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1334                 // lower range->first 4 pixel
1335                 // upper range->next 4 pixel
1336                 vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
1337                 vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
1338                 vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
1339                 vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
1340
1341                 // calculate weight EAST-WEST
1342                 vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
1343                 vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
1344                 vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
1345                 vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
1346                 vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
1347                 vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
1348                 vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
1349                 vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
1350                 vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
1351                 vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
1352
1353                 // calculate address offset
1354                 //
1355                 // pixel NORTH WEST
1356                 vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
1357                 vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
1358
1359                 // pixel NORTH EAST-->(offpixelNW+1)
1360                 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1361                 vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
1362                 vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
1363
1364                 // SOUTH-WEST-->(offpixelNW+src_linestride)
1365                 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1366                 vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
1367                 vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
1368
1369                 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1370                 vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
1371                 vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
1372
1373                 // calculate each address
1374                 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1375                 vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
1376                 vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
1377                 vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
1378                 vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
1379
1380                 vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
1381                 vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
1382                 vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
1383                 vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
1384
1385                 // get each pixel
1386                 //
1387                 // scalar load, afterwards insertion into the right position
1388                 // NORTH WEST
1389                 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1390                 vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
1391                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
1392                 vuc_pixel_NW_lower_range = spu_insert(
1393                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
1394                                 vuc_pixel_NW_lower_range, 7 );
1395                 vuc_pixel_NW_lower_range = spu_insert(
1396                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
1397                                 vuc_pixel_NW_lower_range, 11 );
1398                 vuc_pixel_NW_lower_range = spu_insert(
1399                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
1400                                 vuc_pixel_NW_lower_range, 15 );
1401
1402                 vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
1403                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
1404                 vuc_pixel_NW_upper_range = spu_insert(
1405                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
1406                                 vuc_pixel_NW_upper_range, 7 );
1407                 vuc_pixel_NW_upper_range = spu_insert(
1408                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
1409                                 vuc_pixel_NW_upper_range, 11 );
1410                 vuc_pixel_NW_upper_range = spu_insert(
1411                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
1412                                 vuc_pixel_NW_upper_range, 15 );
1413
1414                 // NORTH EAST
1415                 vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
1416                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
1417                 vuc_pixel_NE_lower_range = spu_insert(
1418                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
1419                                 vuc_pixel_NE_lower_range, 7 );
1420                 vuc_pixel_NE_lower_range = spu_insert(
1421                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
1422                                 vuc_pixel_NE_lower_range, 11 );
1423                 vuc_pixel_NE_lower_range = spu_insert(
1424                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
1425                                 vuc_pixel_NE_lower_range, 15 );
1426
1427                 vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
1428                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
1429                 vuc_pixel_NE_upper_range = spu_insert(
1430                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
1431                                 vuc_pixel_NE_upper_range, 7 );
1432                 vuc_pixel_NE_upper_range = spu_insert(
1433                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
1434                                 vuc_pixel_NE_upper_range, 11 );
1435                 vuc_pixel_NE_upper_range = spu_insert(
1436                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
1437                                 vuc_pixel_NE_upper_range, 15 );
1438
1439
1440                 // SOUTH WEST
1441                 vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
1442                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
1443                 vuc_pixel_SW_lower_range = spu_insert(
1444                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
1445                                 vuc_pixel_SW_lower_range, 7 );
1446                 vuc_pixel_SW_lower_range = spu_insert(
1447                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
1448                                 vuc_pixel_SW_lower_range, 11 );
1449                 vuc_pixel_SW_lower_range = spu_insert(
1450                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
1451                                 vuc_pixel_SW_lower_range, 15 );
1452
1453                 vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
1454                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
1455                 vuc_pixel_SW_upper_range = spu_insert(
1456                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
1457                                 vuc_pixel_SW_upper_range, 7 );
1458                 vuc_pixel_SW_upper_range = spu_insert(
1459                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
1460                                 vuc_pixel_SW_upper_range, 11 );
1461                 vuc_pixel_SW_upper_range = spu_insert(
1462                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
1463                                 vuc_pixel_SW_upper_range, 15 );
1464
1465                 // SOUTH EAST
1466                 vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
1467                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
1468                 vuc_pixel_SE_lower_range = spu_insert(
1469                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
1470                                 vuc_pixel_SE_lower_range, 7 );
1471                 vuc_pixel_SE_lower_range = spu_insert(
1472                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
1473                                 vuc_pixel_SE_lower_range, 11 );
1474                 vuc_pixel_SE_lower_range = spu_insert(
1475                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
1476                                 vuc_pixel_SE_lower_range, 15 );
1477
1478                 vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
1479                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
1480                 vuc_pixel_SE_upper_range = spu_insert(
1481                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
1482                                 vuc_pixel_SE_upper_range, 7 );
1483                 vuc_pixel_SE_upper_range = spu_insert(
1484                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
1485                                 vuc_pixel_SE_upper_range, 11 );
1486                 vuc_pixel_SE_upper_range = spu_insert(
1487                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
1488                                 vuc_pixel_SE_upper_range, 15 );
1489
1490
1491                 // convert to float
1492                 vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
1493                 vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
1494
1495                 vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
1496                 vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
1497
1498                 vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
1499                 vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
1500
1501                 vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
1502                 vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
1503
1504
1505
1506                 // first linear interpolation: EWtop
1507                 // EWtop = NW + EWweight*(NE-NW)
1508                 //
1509                 // lower range
1510                 vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
1511                 vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
1512                                                                 vf_EWtop_lower_range_tmp,
1513                                                                 vf_pixel_NW_lower_range );
1514
1515                 // upper range
1516                 vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
1517                 vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
1518                                                                 vf_EWtop_upper_range_tmp,
1519                                                                 vf_pixel_NW_upper_range );
1520
1521
1522
1523                 // second linear interpolation: EWbottom
1524                 // EWbottom = SW + EWweight*(SE-SW)
1525                 //
1526                 // lower range
1527                 vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
1528                 vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
1529                                                                 vf_EWbottom_lower_range_tmp,
1530                                                                 vf_pixel_SW_lower_range );
1531
1532                 // upper range
1533                 vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
1534                 vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
1535                                                                 vf_EWbottom_upper_range_tmp,
1536                                                                 vf_pixel_SW_upper_range );
1537
1538
1539
1540                 // third linear interpolation: the bilinear interpolated value
1541                 // result = EWtop + NSweight*(EWbottom-EWtop);
1542                 //
1543                 // lower range
1544                 vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
1545                 vector float vf_result_lower_range = spu_madd( vf_NSweight,
1546                                                                 vf_result_lower_range_tmp,
1547                                                                 vf_EWtop_lower_range );
1548
1549                 // upper range
1550                 vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
1551                 vector float vf_result_upper_range = spu_madd( vf_NSweight,
1552                                                                 vf_result_upper_range_tmp,
1553                                                                 vf_EWtop_upper_range );
1554
1555
1556                 // convert back: using saturated arithmetic
1557                 vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
1558                 vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
1559
1560                 // merge results->lower,upper
1561                 vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
1562                                                                0x13, 0x17, 0x1B, 0x1F,
1563                                                                0x00, 0x00, 0x00, 0x00,
1564                                                                0x00, 0x00, 0x00, 0x00 };
1565
1566                 vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
1567                                                                 (vector unsigned char) vui_result_upper_range,
1568                                                                 vuc_mask_merge_result );
1569
1570                 // partial storing
1571                 vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
1572                                                       0x00, 0x00, 0x00, 0x00,
1573                                                       0xFF, 0xFF, 0xFF, 0xFF,
1574                                                       0xFF, 0xFF, 0xFF, 0xFF };
1575
1576
1577                 // get currently stored data
1578                 vector unsigned char vuc_orig = *((vector unsigned char*)dst);
1579
1580                 // clear currently stored data
1581                 vuc_orig = spu_and( vuc_orig,
1582                                 spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
1583
1584                 // rotate result according to storing address
1585                 vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
1586
1587                 // store result
1588                 *((vector unsigned char*)dst) = spu_or( vuc_result,
1589                                                         vuc_orig );
1590                 dst += 8;
1591         }
1592 }
1593
1594
1595 /*
1596  * bilinear_scale_line_w16()
1597  *
1598  * processes a line of yuv-input, width has to be a multiple of 16
1599  * scaled yuv-output is written to local store buffer
1600  *
1601  * @param src buffer for 2 lines input
1602  * @param dst_ buffer for 1 line output
1603  * @param dst_width the width of the destination line
1604  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1605  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1606  * @param src_linestride the stride of the srcline
1607  */
1608 void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1609
1610         unsigned char* dst = dst_;
1611
1612         unsigned int dst_x;
1613         for( dst_x=0; dst_x<dst_width; dst_x+=16) {
1614                 // address calculation for loading the 4 surrounding pixel of each calculated
1615                 // destination pixel
1616                 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1617                 // parallelised processing
1618                 // first range->pixel 1 2 3 4
1619                 // second range->pixel 5 6 7 8
1620                 // third range->pixel 9 10 11 12
1621                 // fourth range->pixel 13 14 15 16
1622                 vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
1623                 vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
1624                 vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
1625                 vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
1626                 vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
1627                 vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
1628                 vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
1629                 vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
1630
1631                 // calculate weight EAST-WEST
1632                 vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
1633                 vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
1634                 vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
1635                 vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
1636                 vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
1637                 vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
1638                 vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
1639                 vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
1640                 vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
1641                 vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
1642                 vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
1643                 vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
1644                 vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
1645                 vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
1646                 vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
1647                 vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
1648                 vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
1649                 vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
1650                 vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
1651                 vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
1652
1653                 // calculate address offset
1654                 //
1655                 // pixel NORTH WEST
1656                 vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
1657                 vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
1658                 vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
1659                 vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
1660
1661                 // pixel NORTH EAST-->(offpixelNW+1)
1662                 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1663                 vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
1664                 vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
1665                 vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
1666                 vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
1667
1668                 // SOUTH-WEST-->(offpixelNW+src_linestride)
1669                 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1670                 vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
1671                 vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
1672                 vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
1673                 vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
1674
1675                 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1676                 vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
1677                 vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
1678                 vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
1679                 vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
1680
1681                 // calculate each address
1682                 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1683                 vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
1684                 vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
1685                 vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
1686                 vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
1687
1688                 vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
1689                 vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
1690                 vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
1691                 vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
1692
1693                 vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
1694                 vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
1695                 vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
1696                 vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
1697
1698                 vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
1699                 vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
1700                 vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
1701                 vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
1702
1703
1704                 // get each pixel
1705                 //
1706                 // scalar load, afterwards insertion into the right position
1707                 // NORTH WEST
1708                 // first range
1709                 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1710                 vector unsigned char vuc_pixel_NW_first_range = spu_insert(
1711                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
1712                 vuc_pixel_NW_first_range = spu_insert(
1713                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
1714                                 vuc_pixel_NW_first_range, 7 );
1715                 vuc_pixel_NW_first_range = spu_insert(
1716                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
1717                                 vuc_pixel_NW_first_range, 11 );
1718                 vuc_pixel_NW_first_range = spu_insert(
1719                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
1720                                 vuc_pixel_NW_first_range, 15 );
1721                 // second range
1722                 vector unsigned char vuc_pixel_NW_second_range = spu_insert(
1723                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
1724                 vuc_pixel_NW_second_range = spu_insert(
1725                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
1726                                 vuc_pixel_NW_second_range, 7 );
1727                 vuc_pixel_NW_second_range = spu_insert(
1728                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
1729                                 vuc_pixel_NW_second_range, 11 );
1730                 vuc_pixel_NW_second_range = spu_insert(
1731                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
1732                                 vuc_pixel_NW_second_range, 15 );
1733                 // third range
1734                 vector unsigned char vuc_pixel_NW_third_range = spu_insert(
1735                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
1736                 vuc_pixel_NW_third_range = spu_insert(
1737                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
1738                                 vuc_pixel_NW_third_range, 7 );
1739                 vuc_pixel_NW_third_range = spu_insert(
1740                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
1741                                 vuc_pixel_NW_third_range, 11 );
1742                 vuc_pixel_NW_third_range = spu_insert(
1743                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
1744                                 vuc_pixel_NW_third_range, 15 );
1745                 // fourth range
1746                 vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
1747                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
1748                 vuc_pixel_NW_fourth_range = spu_insert(
1749                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
1750                                 vuc_pixel_NW_fourth_range, 7 );
1751                 vuc_pixel_NW_fourth_range = spu_insert(
1752                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
1753                                 vuc_pixel_NW_fourth_range, 11 );
1754                 vuc_pixel_NW_fourth_range = spu_insert(
1755                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
1756                                 vuc_pixel_NW_fourth_range, 15 );
1757
1758                 // NORTH EAST
1759                 // first range
1760                 vector unsigned char vuc_pixel_NE_first_range = spu_insert(
1761                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
1762                 vuc_pixel_NE_first_range = spu_insert(
1763                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
1764                                 vuc_pixel_NE_first_range, 7 );
1765                 vuc_pixel_NE_first_range = spu_insert(
1766                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
1767                                 vuc_pixel_NE_first_range, 11 );
1768                 vuc_pixel_NE_first_range = spu_insert(
1769                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
1770                                 vuc_pixel_NE_first_range, 15 );
1771                 // second range
1772                 vector unsigned char vuc_pixel_NE_second_range = spu_insert(
1773                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
1774                 vuc_pixel_NE_second_range = spu_insert(
1775                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
1776                                 vuc_pixel_NE_second_range, 7 );
1777                 vuc_pixel_NE_second_range = spu_insert(
1778                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
1779                                 vuc_pixel_NE_second_range, 11 );
1780                 vuc_pixel_NE_second_range = spu_insert(
1781                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
1782                                 vuc_pixel_NE_second_range, 15 );
1783                 // third range
1784                 vector unsigned char vuc_pixel_NE_third_range = spu_insert(
1785                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
1786                 vuc_pixel_NE_third_range = spu_insert(
1787                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
1788                                 vuc_pixel_NE_third_range, 7 );
1789                 vuc_pixel_NE_third_range = spu_insert(
1790                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
1791                                 vuc_pixel_NE_third_range, 11 );
1792                 vuc_pixel_NE_third_range = spu_insert(
1793                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
1794                                 vuc_pixel_NE_third_range, 15 );
1795                 // fourth range
1796                 vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
1797                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
1798                 vuc_pixel_NE_fourth_range = spu_insert(
1799                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
1800                                 vuc_pixel_NE_fourth_range, 7 );
1801                 vuc_pixel_NE_fourth_range = spu_insert(
1802                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
1803                                 vuc_pixel_NE_fourth_range, 11 );
1804                 vuc_pixel_NE_fourth_range = spu_insert(
1805                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
1806                                 vuc_pixel_NE_fourth_range, 15 );
1807
1808                 // SOUTH WEST
1809                 // first range
1810                 vector unsigned char vuc_pixel_SW_first_range = spu_insert(
1811                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
1812                 vuc_pixel_SW_first_range = spu_insert(
1813                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
1814                                 vuc_pixel_SW_first_range, 7 );
1815                 vuc_pixel_SW_first_range = spu_insert(
1816                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
1817                                 vuc_pixel_SW_first_range, 11 );
1818                 vuc_pixel_SW_first_range = spu_insert(
1819                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
1820                                 vuc_pixel_SW_first_range, 15 );
1821                 // second range
1822                 vector unsigned char vuc_pixel_SW_second_range = spu_insert(
1823                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
1824                 vuc_pixel_SW_second_range = spu_insert(
1825                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
1826                                 vuc_pixel_SW_second_range, 7 );
1827                 vuc_pixel_SW_second_range = spu_insert(
1828                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
1829                                 vuc_pixel_SW_second_range, 11 );
1830                 vuc_pixel_SW_second_range = spu_insert(
1831                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
1832                                 vuc_pixel_SW_second_range, 15 );
1833                 // third range
1834                 vector unsigned char vuc_pixel_SW_third_range = spu_insert(
1835                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
1836                 vuc_pixel_SW_third_range = spu_insert(
1837                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
1838                                 vuc_pixel_SW_third_range, 7 );
1839                 vuc_pixel_SW_third_range = spu_insert(
1840                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
1841                                 vuc_pixel_SW_third_range, 11 );
1842                 vuc_pixel_SW_third_range = spu_insert(
1843                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
1844                                 vuc_pixel_SW_third_range, 15 );
1845                 // fourth range
1846                 vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
1847                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
1848                 vuc_pixel_SW_fourth_range = spu_insert(
1849                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
1850                                 vuc_pixel_SW_fourth_range, 7 );
1851                 vuc_pixel_SW_fourth_range = spu_insert(
1852                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
1853                                 vuc_pixel_SW_fourth_range, 11 );
1854                 vuc_pixel_SW_fourth_range = spu_insert(
1855                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
1856                                 vuc_pixel_SW_fourth_range, 15 );
1857
1858                 // SOUTH EAST
1859                 // first range
1860                 vector unsigned char vuc_pixel_SE_first_range = spu_insert(
1861                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
1862                 vuc_pixel_SE_first_range = spu_insert(
1863                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
1864                                 vuc_pixel_SE_first_range, 7 );
1865                 vuc_pixel_SE_first_range = spu_insert(
1866                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
1867                                 vuc_pixel_SE_first_range, 11 );
1868                 vuc_pixel_SE_first_range = spu_insert(
1869                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
1870                                 vuc_pixel_SE_first_range, 15 );
1871                 // second range
1872                 vector unsigned char vuc_pixel_SE_second_range = spu_insert(
1873                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
1874                 vuc_pixel_SE_second_range = spu_insert(
1875                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
1876                                 vuc_pixel_SE_second_range, 7 );
1877                 vuc_pixel_SE_second_range = spu_insert(
1878                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
1879                                 vuc_pixel_SE_second_range, 11 );
1880                 vuc_pixel_SE_second_range = spu_insert(
1881                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
1882                                 vuc_pixel_SE_second_range, 15 );
1883                 // third range
1884                 vector unsigned char vuc_pixel_SE_third_range = spu_insert(
1885                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
1886                 vuc_pixel_SE_third_range = spu_insert(
1887                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
1888                                 vuc_pixel_SE_third_range, 7 );
1889                 vuc_pixel_SE_third_range = spu_insert(
1890                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
1891                                 vuc_pixel_SE_third_range, 11 );
1892                 vuc_pixel_SE_third_range = spu_insert(
1893                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
1894                                 vuc_pixel_SE_third_range, 15 );
1895                 // fourth range
1896                 vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
1897                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
1898                 vuc_pixel_SE_fourth_range = spu_insert(
1899                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
1900                                 vuc_pixel_SE_fourth_range, 7 );
1901                 vuc_pixel_SE_fourth_range = spu_insert(
1902                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
1903                                 vuc_pixel_SE_fourth_range, 11 );
1904                 vuc_pixel_SE_fourth_range = spu_insert(
1905                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
1906                                 vuc_pixel_SE_fourth_range, 15 );
1907
1908
1909
1910                 // convert to float
1911                 vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
1912                 vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
1913                 vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
1914                 vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
1915
1916                 vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
1917                 vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
1918                 vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
1919                 vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
1920
1921                 vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
1922                 vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
1923                 vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
1924                 vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
1925
1926                 vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
1927                 vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
1928                 vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
1929                 vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
1930
1931                 // first linear interpolation: EWtop
1932                 // EWtop = NW + EWweight*(NE-NW)
1933                 //
1934                 // first range
1935                 vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
1936                 vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
1937                                                                 vf_EWtop_first_range_tmp,
1938                                                                 vf_pixel_NW_first_range );
1939
1940                 // second range
1941                 vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
1942                 vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
1943                                                                 vf_EWtop_second_range_tmp,
1944                                                                 vf_pixel_NW_second_range );
1945
1946                 // third range
1947                 vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
1948                 vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
1949                                                                 vf_EWtop_third_range_tmp,
1950                                                                 vf_pixel_NW_third_range );
1951
1952                 // fourth range
1953                 vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
1954                 vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
1955                                                                 vf_EWtop_fourth_range_tmp,
1956                                                                 vf_pixel_NW_fourth_range );
1957
1958
1959
1960                 // second linear interpolation: EWbottom
1961                 // EWbottom = SW + EWweight*(SE-SW)
1962                 //
1963                 // first range
1964                 vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
1965                 vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
1966                                                                 vf_EWbottom_first_range_tmp,
1967                                                                 vf_pixel_SW_first_range );
1968
1969                 // second range
1970                 vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
1971                 vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
1972                                                                 vf_EWbottom_second_range_tmp,
1973                                                                 vf_pixel_SW_second_range );
1974                 // third range
1975                 vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
1976                 vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
1977                                                                 vf_EWbottom_third_range_tmp,
1978                                                                 vf_pixel_SW_third_range );
1979
1980                 // fourth range
1981                 vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
1982                 vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
1983                                                                 vf_EWbottom_fourth_range_tmp,
1984                                                                 vf_pixel_SW_fourth_range );
1985
1986
1987
1988                 // third linear interpolation: the bilinear interpolated value
1989                 // result = EWtop + NSweight*(EWbottom-EWtop);
1990                 //
1991                 // first range
1992                 vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
1993                 vector float vf_result_first_range = spu_madd( vf_NSweight,
1994                                                                 vf_result_first_range_tmp,
1995                                                                 vf_EWtop_first_range );
1996
1997                 // second range
1998                 vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
1999                 vector float vf_result_second_range = spu_madd( vf_NSweight,
2000                                                                 vf_result_second_range_tmp,
2001                                                                 vf_EWtop_second_range );
2002
2003                 // third range
2004                 vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
2005                 vector float vf_result_third_range = spu_madd( vf_NSweight,
2006                                                                 vf_result_third_range_tmp,
2007                                                                 vf_EWtop_third_range );
2008
2009                 // fourth range
2010                 vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
2011                 vector float vf_result_fourth_range = spu_madd( vf_NSweight,
2012                                                                 vf_result_fourth_range_tmp,
2013                                                                 vf_EWtop_fourth_range );
2014
2015
2016
2017                 // convert back: using saturated arithmetic
2018                 vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
2019                 vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
2020                 vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
2021                 vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
2022
2023                 // merge results->lower,upper
2024                 vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
2025                                                                             0x13, 0x17, 0x1B, 0x1F,
2026                                                                             0x00, 0x00, 0x00, 0x00,
2027                                                                             0x00, 0x00, 0x00, 0x00 };
2028
2029                 vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
2030                                                                             0x00, 0x00, 0x00, 0x00,
2031                                                                             0x03, 0x07, 0x0B, 0x0F,
2032                                                                             0x13, 0x17, 0x1B, 0x1F };
2033
2034                 vector unsigned char vuc_result_first_second =
2035                                                 spu_shuffle( (vector unsigned char) vui_result_first_range,
2036                                                                  (vector unsigned char) vui_result_second_range,
2037                                                                 vuc_mask_merge_result_first_second );
2038
2039                 vector unsigned char vuc_result_third_fourth =
2040                                                 spu_shuffle( (vector unsigned char) vui_result_third_range,
2041                                                                  (vector unsigned char) vui_result_fourth_range,
2042                                                                 vuc_mask_merge_result_third_fourth );
2043
2044                 // store result
2045                 *((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
2046                                                         vuc_result_third_fourth );
2047                 dst += 16;
2048         }
2049 }
2050