SDL-1.2.14
[sdl_omap.git] / src / video / ps3 / spulibs / bilin_scaler.c
1 /*
2  * SDL - Simple DirectMedia Layer
3  * CELL BE Support for PS3 Framebuffer
4  * Copyright (C) 2008, 2009 International Business Machines Corporation
5  *
6  * This library is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU Lesser General Public License as published
8  * by the Free Software Foundation; either version 2.1 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  *
21  *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
22  *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
23  *  SPE code based on research by:
24  *  Rene Becker
25  *  Thimo Emmerich
26  */
27
28 #include "spu_common.h"
29
30 #include <spu_intrinsics.h>
31 #include <spu_mfcio.h>
32
33 // Debugging
34 //#define DEBUG
35
#ifdef DEBUG
/*
 * Debug trace: prints to stdout and flushes right away.
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and stays correct in unbraced if/else bodies.
 * Needs fprintf/fflush to be declared when DEBUG is enabled.
 */
#define deprintf(fmt, args... ) \
        do { \
                fprintf( stdout, fmt, ##args ); \
                fflush( stdout ); \
        } while (0)
#else
/* Tracing disabled: the call expands to nothing. */
#define deprintf( fmt, args... )
#endif
43
44 struct scale_parms_t parms __attribute__((aligned(128)));
45
/* Source line buffers: a maximum of 8 lines of Y, and therefore 4 lines of V
 * and 4 lines of U, are stored.  The data may have to be retrieved from a
 * misaligned address, so every line is enlarged by 128 bytes to leave room
 * for the extra bytes of the incoming V and U planes.
 */
50 unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
51 unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
52 unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
53
54 /* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */
55 unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
56 unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
57 unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
58
59 /* some vectors needed by the float to int conversion */
60 static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
61 static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
62
63 void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
64 void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
65
66 void scale_srcw16_dstw16();
67 void scale_srcw16_dstw32();
68 void scale_srcw32_dstw16();
69 void scale_srcw32_dstw32();
70
71 int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
72 {
73         deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
74         /* DMA transfer for the input parameters */
75         spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
76         DMA_WAIT_TAG(TAG_INIT);
77
78         deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
79                         parms.dst_pixel_width, parms.dst_pixel_height);
80
81         if(parms.src_pixel_width & 0x1f) {
82                 if(parms.dst_pixel_width & 0x1F) {
83                         deprintf("[SPU] Using scale_srcw16_dstw16\n");
84                         scale_srcw16_dstw16();
85                 } else {
86                         deprintf("[SPU] Using scale_srcw16_dstw32\n");
87                         scale_srcw16_dstw32();
88                 }
89         } else {
90                 if(parms.dst_pixel_width & 0x1F) {
91                         deprintf("[SPU] Using scale_srcw32_dstw16\n");
92                         scale_srcw32_dstw16();
93                 } else {
94                         deprintf("[SPU] Using scale_srcw32_dstw32\n");
95                         scale_srcw32_dstw32();
96                 }
97         }
98         deprintf("[SPU] bilin_scaler_spu... done!\n");
99
100         return 0;
101 }
102
103
104 /*
105  * vfloat_to_vuint()
106  *
107  * converts a float vector to an unsinged int vector using saturated
108  * arithmetic
109  *
110  * @param vec_s float vector for conversion
111  * @returns converted unsigned int vector
112  */
113 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
114         vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
115         vec_s = spu_sel(vec_s, vec_0_1, select_1);
116
117         vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
118         vec_s = spu_sel(vec_s, vec_255, select_2);
119         return spu_convtu(vec_s,0);
120 }
121
122
123 /*
124  * scale_srcw16_dstw16()
125  *
126  * processes an input image of width 16
127  * scaling is done to a width 16
128  * result stored in RAM
129  */
130 void scale_srcw16_dstw16() {
131         // extract parameters
132         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
133
134         unsigned int src_width = parms.src_pixel_width;
135         unsigned int src_height = parms.src_pixel_height;
136         unsigned int dst_width = parms.dst_pixel_width;
137         unsigned int dst_height = parms.dst_pixel_height;
138
139         // YVU
140         unsigned int src_linestride_y = src_width;
141         unsigned int src_dbl_linestride_y = src_width<<1;
142         unsigned int src_linestride_vu = src_width>>1;
143         unsigned int src_dbl_linestride_vu = src_width;
144
145         // scaled YVU
146         unsigned int scaled_src_linestride_y = dst_width;
147
148         // ram addresses
149         unsigned char* src_addr_y = parms.y_plane;
150         unsigned char* src_addr_v = parms.v_plane;
151         unsigned char* src_addr_u = parms.u_plane;
152
153         // for handling misalignment, addresses are precalculated
154         unsigned char* precalc_src_addr_v = src_addr_v;
155         unsigned char* precalc_src_addr_u = src_addr_u;
156
157         unsigned int dst_picture_size = dst_width*dst_height;
158
159         // Sizes for destination
160         unsigned int dst_dbl_linestride_y = dst_width<<1;
161         unsigned int dst_dbl_linestride_vu = dst_width>>1;
162
163         // Perform address calculation for Y, V and U in main memory with dst_addr as base
164         unsigned char* dst_addr_main_memory_y = dst_addr;
165         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
166         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
167
168         // calculate scale factors
169         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
170         float y_scale = (float)src_height/(float)dst_height;
171
172         // double buffered processing
173         // buffer switching
174         unsigned int curr_src_idx = 0;
175         unsigned int curr_dst_idx = 0;
176         unsigned int next_src_idx, next_dst_idx;
177
178         // 2 lines y as output, upper and lowerline
179         unsigned int curr_interpl_y_upper = 0;
180         unsigned int next_interpl_y_upper;
181         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
182         // only 1 line v/u output, both planes have the same dimension
183         unsigned int curr_interpl_vu = 0;
184         unsigned int next_interpl_vu;
185
186         // weights, calculated in every loop iteration
187         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
188         vector float vf_next_NSweight_y_upper;
189         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
190         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
191         vector float vf_next_NSweight_vu;
192
193         // line indices for the src picture
194         float curr_src_y_upper = 0.0f, next_src_y_upper;
195         float curr_src_y_lower, next_src_y_lower;
196         float curr_src_vu = 0.0f, next_src_vu;
197
198         // line indices for the dst picture
199         unsigned int dst_y=0, dst_vu=0;
200
201         // offset for the v and u plane to handle misalignement
202         unsigned int curr_lsoff_v = 0, next_lsoff_v;
203         unsigned int curr_lsoff_u = 0, next_lsoff_u;
204
205         // calculate lower line indices
206         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
207         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
208         // lower line weight
209         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
210
211
212         // start partially double buffered processing
213         // get initial data, 2 sets of y, 1 set v, 1 set u
214         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
215         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
216                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
217                         src_dbl_linestride_y,
218                         RETR_BUF,
219                         0, 0 );
220         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
221         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
222
223         /* iteration loop
224          * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
225          * the scaled output is 2 lines y, 1 line v, 1 line u
226          * the yuv2rgb-converted output is stored to RAM
227          */
228         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
229                 dst_y = dst_vu<<1;
230
231                 // calculate next indices
232                 next_src_vu = ((float)dst_vu+1)*y_scale;
233                 next_src_y_upper = ((float)dst_y+2)*y_scale;
234                 next_src_y_lower = ((float)dst_y+3)*y_scale;
235
236                 next_interpl_vu = (unsigned int) next_src_vu;
237                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
238                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
239
240                 // calculate weight NORTH-SOUTH
241                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
242                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
243                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
244
245                 // get next lines
246                 next_src_idx = curr_src_idx^1;
247                 next_dst_idx = curr_dst_idx^1;
248
249                 // 4 lines y
250                 mfc_get( y_plane[next_src_idx],
251                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
252                                 src_dbl_linestride_y,
253                                 RETR_BUF+next_src_idx,
254                                 0, 0 );
255                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
256                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
257                                 src_dbl_linestride_y,
258                                 RETR_BUF+next_src_idx,
259                                 0, 0 );
260
261                 // 2 lines v
262                 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
263                 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
264                 mfc_get( v_plane[next_src_idx],
265                                 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
266                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
267                                 RETR_BUF+next_src_idx,
268                                 0, 0 );
269                 // 2 lines u
270                 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
271                 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
272                 mfc_get( u_plane[next_src_idx],
273                                 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
274                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
275                                 RETR_BUF+next_src_idx,
276                                 0, 0 );
277
278                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
279
280                 // scaling
281                 // work line y_upper
282                 bilinear_scale_line_w16( y_plane[curr_src_idx],
283                                 scaled_y_plane[curr_src_idx],
284                                 dst_width,
285                                 vf_x_scale,
286                                 vf_curr_NSweight_y_upper,
287                                 src_linestride_y );
288                 // work line y_lower
289                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
290                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
291                                 dst_width,
292                                 vf_x_scale,
293                                 vf_curr_NSweight_y_lower,
294                                 src_linestride_y );
295                 // work line v
296                 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
297                                 scaled_v_plane[curr_src_idx],
298                                 dst_width>>1,
299                                 vf_x_scale,
300                                 vf_curr_NSweight_vu,
301                                 src_linestride_vu );
302                 // work line u
303                 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
304                                 scaled_u_plane[curr_src_idx],
305                                 dst_width>>1,
306                                 vf_x_scale,
307                                 vf_curr_NSweight_vu,
308                                 src_linestride_vu );
309
310
311                 // Store the result back to main memory into a destination buffer in YUV format
312                 //---------------------------------------------------------------------------------------------
313                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
314
315                 // Perform three DMA transfers to 3 different locations in the main memory!
316                 // dst_width:   Pixel width of destination image
317                 // dst_addr:    Destination address in main memory
318                 // dst_vu:      Counter which is incremented one by one
319                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
320                 mfc_put(        scaled_y_plane[curr_src_idx],                                   // What from local store (addr)
321                                 (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),   // Destination in main memory (addr)
322                                 dst_dbl_linestride_y,                                           // Two Y lines (depending on the widht of the destination resolution)
323                                 STR_BUF+curr_dst_idx,                                           // Tag
324                                 0, 0 );
325
326                 mfc_put(        scaled_v_plane[curr_src_idx],                                   // What from local store (addr)
327                                 (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
328                                 dst_dbl_linestride_vu,                                          // Two V lines (depending on the widht of the destination resolution)
329                                 STR_BUF+curr_dst_idx,                                           // Tag
330                                 0, 0 );
331
332                 mfc_put(        scaled_u_plane[curr_src_idx],                                   // What from local store (addr)
333                                 (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
334                                 dst_dbl_linestride_vu,                                          // Two U lines (depending on the widht of the destination resolution)
335                                 STR_BUF+curr_dst_idx,                                           // Tag
336                                 0, 0 );
337                 //---------------------------------------------------------------------------------------------
338
339
340                 // update for next cycle
341                 curr_src_idx = next_src_idx;
342                 curr_dst_idx = next_dst_idx;
343
344                 curr_interpl_y_upper = next_interpl_y_upper;
345                 curr_interpl_y_lower = next_interpl_y_lower;
346                 curr_interpl_vu = next_interpl_vu;
347
348                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
349                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
350                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
351
352                 curr_src_y_upper = next_src_y_upper;
353                 curr_src_y_lower = next_src_y_lower;
354                 curr_src_vu = next_src_vu;
355
356                 curr_lsoff_v = next_lsoff_v;
357                 curr_lsoff_u = next_lsoff_u;
358         }
359
360
361
362         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
363
364         // scaling
365         // work line y_upper
366         bilinear_scale_line_w16( y_plane[curr_src_idx],
367                         scaled_y_plane[curr_src_idx],
368                         dst_width,
369                         vf_x_scale,
370                         vf_curr_NSweight_y_upper,
371                         src_linestride_y );
372         // work line y_lower
373         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
374                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
375                         dst_width,
376                         vf_x_scale,
377                         vf_curr_NSweight_y_lower,
378                         src_linestride_y );
379         // work line v
380         bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
381                         scaled_v_plane[curr_src_idx],
382                         dst_width>>1,
383                         vf_x_scale,
384                         vf_curr_NSweight_vu,
385                         src_linestride_vu );
386         // work line u
387         bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
388                         scaled_u_plane[curr_src_idx],
389                         dst_width>>1,
390                         vf_x_scale,
391                         vf_curr_NSweight_vu,
392                         src_linestride_vu );
393
394
395         // Store the result back to main memory into a destination buffer in YUV format
396         //---------------------------------------------------------------------------------------------
397         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
398
399         // Perform three DMA transfers to 3 different locations in the main memory!
400         // dst_width:   Pixel width of destination image
401         // dst_addr:    Destination address in main memory
402         // dst_vu:      Counter which is incremented one by one
403         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
404         mfc_put(        scaled_y_plane[curr_src_idx],                                   // What from local store (addr)
405                         (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),   // Destination in main memory (addr)
406                         dst_dbl_linestride_y,                                           // Two Y lines (depending on the widht of the destination resolution)
407                         STR_BUF+curr_dst_idx,                                           // Tag
408                         0, 0 );
409
410         mfc_put(        scaled_v_plane[curr_src_idx],                                   // What from local store (addr)
411                         (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
412                         dst_dbl_linestride_vu,                                          // Two V lines (depending on the widht of the destination resolution)
413                         STR_BUF+curr_dst_idx,                                           // Tag
414                         0, 0 );
415
416         mfc_put(        scaled_u_plane[curr_src_idx],                                   // What from local store (addr)
417                         (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
418                         dst_dbl_linestride_vu,                                          // Two U lines (depending on the widht of the destination resolution)
419                         STR_BUF+curr_dst_idx,                                           // Tag
420                         0, 0 );
421
422         // wait for completion
423         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
424         //---------------------------------------------------------------------------------------------
425 }
426
427
428 /*
429  * scale_srcw16_dstw32()
430  *
431  * processes an input image of width 16
432  * scaling is done to a width 32
433  * yuv2rgb conversion on a width of 32
434  * result stored in RAM
435  */
436 void scale_srcw16_dstw32() {
437         // extract parameters
438         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
439
440         unsigned int src_width = parms.src_pixel_width;
441         unsigned int src_height = parms.src_pixel_height;
442         unsigned int dst_width = parms.dst_pixel_width;
443         unsigned int dst_height = parms.dst_pixel_height;
444
445         // YVU
446         unsigned int src_linestride_y = src_width;
447         unsigned int src_dbl_linestride_y = src_width<<1;
448         unsigned int src_linestride_vu = src_width>>1;
449         unsigned int src_dbl_linestride_vu = src_width;
450         // scaled YVU
451         unsigned int scaled_src_linestride_y = dst_width;
452
453         // ram addresses
454         unsigned char* src_addr_y = parms.y_plane;
455         unsigned char* src_addr_v = parms.v_plane;
456         unsigned char* src_addr_u = parms.u_plane;
457
458         unsigned int dst_picture_size = dst_width*dst_height;
459
460         // Sizes for destination
461         unsigned int dst_dbl_linestride_y = dst_width<<1;
462         unsigned int dst_dbl_linestride_vu = dst_width>>1;
463
464         // Perform address calculation for Y, V and U in main memory with dst_addr as base
465         unsigned char* dst_addr_main_memory_y = dst_addr;
466         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
467         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
468
469
470         // for handling misalignment, addresses are precalculated
471         unsigned char* precalc_src_addr_v = src_addr_v;
472         unsigned char* precalc_src_addr_u = src_addr_u;
473
474         // calculate scale factors
475         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
476         float y_scale = (float)src_height/(float)dst_height;
477
478         // double buffered processing
479         // buffer switching
480         unsigned int curr_src_idx = 0;
481         unsigned int curr_dst_idx = 0;
482         unsigned int next_src_idx, next_dst_idx;
483
484         // 2 lines y as output, upper and lowerline
485         unsigned int curr_interpl_y_upper = 0;
486         unsigned int next_interpl_y_upper;
487         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
488         // only 1 line v/u output, both planes have the same dimension
489         unsigned int curr_interpl_vu = 0;
490         unsigned int next_interpl_vu;
491
492         // weights, calculated in every loop iteration
493         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
494         vector float vf_next_NSweight_y_upper;
495         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
496         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
497         vector float vf_next_NSweight_vu;
498
499         // line indices for the src picture
500         float curr_src_y_upper = 0.0f, next_src_y_upper;
501         float curr_src_y_lower, next_src_y_lower;
502         float curr_src_vu = 0.0f, next_src_vu;
503
504         // line indices for the dst picture
505         unsigned int dst_y=0, dst_vu=0;
506
507         // offset for the v and u plane to handle misalignement
508         unsigned int curr_lsoff_v = 0, next_lsoff_v;
509         unsigned int curr_lsoff_u = 0, next_lsoff_u;
510
511         // calculate lower line idices
512         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
513         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
514         // lower line weight
515         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
516
517
518         // start partially double buffered processing
519         // get initial data, 2 sets of y, 1 set v, 1 set u
520         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
521         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
522                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
523                         src_dbl_linestride_y,
524                         RETR_BUF,
525                         0, 0 );
526         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
527         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
528
529         // iteration loop
530         // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
531         // the scaled output is 2 lines y, 1 line v, 1 line u
532         // the yuv2rgb-converted output is stored to RAM
533         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
534                 dst_y = dst_vu<<1;
535
536                 // calculate next indices
537                 next_src_vu = ((float)dst_vu+1)*y_scale;
538                 next_src_y_upper = ((float)dst_y+2)*y_scale;
539                 next_src_y_lower = ((float)dst_y+3)*y_scale;
540
541                 next_interpl_vu = (unsigned int) next_src_vu;
542                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
543                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
544
545                 // calculate weight NORTH-SOUTH
546                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
547                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
548                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
549
550                 // get next lines
551                 next_src_idx = curr_src_idx^1;
552                 next_dst_idx = curr_dst_idx^1;
553
554                 // 4 lines y
555                 mfc_get( y_plane[next_src_idx],
556                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
557                                 src_dbl_linestride_y,
558                                 RETR_BUF+next_src_idx,
559                                 0, 0 );
560                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
561                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
562                                 src_dbl_linestride_y,
563                                 RETR_BUF+next_src_idx,
564                                 0, 0 );
565
566                 // 2 lines v
567                 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
568                 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
569                 mfc_get( v_plane[next_src_idx],
570                                 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
571                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
572                                 RETR_BUF+next_src_idx,
573                                 0, 0 );
574                 // 2 lines u
575                 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
576                 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
577                 mfc_get( u_plane[next_src_idx],
578                                 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
579                                 src_dbl_linestride_vu+(next_lsoff_v<<1),
580                                 RETR_BUF+next_src_idx,
581                                 0, 0 );
582
583                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
584
585                 // scaling
586                 // work line y_upper
587                 bilinear_scale_line_w16( y_plane[curr_src_idx],
588                                 scaled_y_plane[curr_src_idx],
589                                 dst_width,
590                                 vf_x_scale,
591                                 vf_curr_NSweight_y_upper,
592                                 src_linestride_y );
593                 // work line y_lower
594                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
595                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
596                                 dst_width,
597                                 vf_x_scale,
598                                 vf_curr_NSweight_y_lower,
599                                 src_linestride_y );
600                 // work line v
601                 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
602                                 scaled_v_plane[curr_src_idx],
603                                 dst_width>>1,
604                                 vf_x_scale,
605                                 vf_curr_NSweight_vu,
606                                 src_linestride_vu );
607                 // work line u
608                 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
609                                 scaled_u_plane[curr_src_idx],
610                                 dst_width>>1,
611                                 vf_x_scale,
612                                 vf_curr_NSweight_vu,
613                                 src_linestride_vu );
614
615                 //---------------------------------------------------------------------------------------------
616                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
617
618                 // Perform three DMA transfers to 3 different locations in the main memory!
619                 // dst_width:   Pixel width of destination image
620                 // dst_addr:    Destination address in main memory
621                 // dst_vu:      Counter which is incremented one by one
622                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
623
624                 mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
625                                 (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
626                                 dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
627                                 STR_BUF+curr_dst_idx,                                                           // Tag
628                                 0, 0 );
629
630                 mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
631                                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
632                                 dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
633                                 STR_BUF+curr_dst_idx,                                                           // Tag
634                                 0, 0 );
635
636                 mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
637                                 (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),        // Destination in main memory (addr)
638                                 dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
639                                 STR_BUF+curr_dst_idx,                                                           // Tag
640                                 0, 0 );
641                 //---------------------------------------------------------------------------------------------
642
643
644                 // update for next cycle
645                 curr_src_idx = next_src_idx;
646                 curr_dst_idx = next_dst_idx;
647
648                 curr_interpl_y_upper = next_interpl_y_upper;
649                 curr_interpl_y_lower = next_interpl_y_lower;
650                 curr_interpl_vu = next_interpl_vu;
651
652                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
653                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
654                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
655
656                 curr_src_y_upper = next_src_y_upper;
657                 curr_src_y_lower = next_src_y_lower;
658                 curr_src_vu = next_src_vu;
659
660                 curr_lsoff_v = next_lsoff_v;
661                 curr_lsoff_u = next_lsoff_u;
662         }
663
664
665
666         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
667
668         // scaling
669         // work line y_upper
670         bilinear_scale_line_w16( y_plane[curr_src_idx],
671                         scaled_y_plane[curr_src_idx],
672                         dst_width,
673                         vf_x_scale,
674                         vf_curr_NSweight_y_upper,
675                         src_linestride_y );
676         // work line y_lower
677         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
678                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
679                         dst_width,
680                         vf_x_scale,
681                         vf_curr_NSweight_y_lower,
682                         src_linestride_y );
683         // work line v
684         bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
685                         scaled_v_plane[curr_src_idx],
686                         dst_width>>1,
687                         vf_x_scale,
688                         vf_curr_NSweight_vu,
689                         src_linestride_vu );
690         // work line u
691         bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
692                         scaled_u_plane[curr_src_idx],
693                         dst_width>>1,
694                         vf_x_scale,
695                         vf_curr_NSweight_vu,
696                         src_linestride_vu );
697
698         //---------------------------------------------------------------------------------------------
699         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
700
701         // Perform three DMA transfers to 3 different locations in the main memory!
702         // dst_width:   Pixel width of destination image
703         // dst_addr:    Destination address in main memory
704         // dst_vu:      Counter which is incremented one by one
705         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
706
707         mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
708                         (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
709                         dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
710                         STR_BUF+curr_dst_idx,                                                           // Tag
711                         0, 0 );
712
713         mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
714                         (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
715                         dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
716                         STR_BUF+curr_dst_idx,                                                           // Tag
717                         0, 0 );
718
719         mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
720                         (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),        // Destination in main memory (addr)
721                         dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
722                         STR_BUF+curr_dst_idx,                                                           // Tag
723                         0, 0 );
724
725         // wait for completion
726         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
727         //---------------------------------------------------------------------------------------------
728 }
729
730
731 /*
732  * scale_srcw32_dstw16()
733  *
734  * processes an input image of width 32
735  * scaling is done to a width 16
736  * yuv2rgb conversion on a width of 16
737  * result stored in RAM
738  */
739 void scale_srcw32_dstw16() {
740         // extract parameters
741         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
742
743         unsigned int src_width = parms.src_pixel_width;
744         unsigned int src_height = parms.src_pixel_height;
745         unsigned int dst_width = parms.dst_pixel_width;
746         unsigned int dst_height = parms.dst_pixel_height;
747
748         // YVU
749         unsigned int src_linestride_y = src_width;
750         unsigned int src_dbl_linestride_y = src_width<<1;
751         unsigned int src_linestride_vu = src_width>>1;
752         unsigned int src_dbl_linestride_vu = src_width;
753         // scaled YVU
754         unsigned int scaled_src_linestride_y = dst_width;
755
756         // ram addresses
757         unsigned char* src_addr_y = parms.y_plane;
758         unsigned char* src_addr_v = parms.v_plane;
759         unsigned char* src_addr_u = parms.u_plane;
760
761         unsigned int dst_picture_size = dst_width*dst_height;
762
763         // Sizes for destination
764         unsigned int dst_dbl_linestride_y = dst_width<<1;
765         unsigned int dst_dbl_linestride_vu = dst_width>>1;
766
767         // Perform address calculation for Y, V and U in main memory with dst_addr as base
768         unsigned char* dst_addr_main_memory_y = dst_addr;
769         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
770         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
771
772         // calculate scale factors
773         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
774         float y_scale = (float)src_height/(float)dst_height;
775
776         // double buffered processing
777         // buffer switching
778         unsigned int curr_src_idx = 0;
779         unsigned int curr_dst_idx = 0;
780         unsigned int next_src_idx, next_dst_idx;
781
782         // 2 lines y as output, upper and lowerline
783         unsigned int curr_interpl_y_upper = 0;
784         unsigned int next_interpl_y_upper;
785         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
786         // only 1 line v/u output, both planes have the same dimension
787         unsigned int curr_interpl_vu = 0;
788         unsigned int next_interpl_vu;
789
790         // weights, calculated in every loop iteration
791         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
792         vector float vf_next_NSweight_y_upper;
793         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
794         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
795         vector float vf_next_NSweight_vu;
796
797         // line indices for the src picture
798         float curr_src_y_upper = 0.0f, next_src_y_upper;
799         float curr_src_y_lower, next_src_y_lower;
800         float curr_src_vu = 0.0f, next_src_vu;
801
802         // line indices for the dst picture
803         unsigned int dst_y=0, dst_vu=0;
804
805         // calculate lower line idices
806         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
807         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
808         // lower line weight
809         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
810
811
812         // start partially double buffered processing
813         // get initial data, 2 sets of y, 1 set v, 1 set u
814         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
815         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
816                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
817                         src_dbl_linestride_y,
818                         RETR_BUF,
819                         0, 0 );
820         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
821         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
822
823         // iteration loop
824         // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
825         // the scaled output is 2 lines y, 1 line v, 1 line u
826         // the yuv2rgb-converted output is stored to RAM
827         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
828                 dst_y = dst_vu<<1;
829
830                 // calculate next indices
831                 next_src_vu = ((float)dst_vu+1)*y_scale;
832                 next_src_y_upper = ((float)dst_y+2)*y_scale;
833                 next_src_y_lower = ((float)dst_y+3)*y_scale;
834
835                 next_interpl_vu = (unsigned int) next_src_vu;
836                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
837                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
838
839                 // calculate weight NORTH-SOUTH
840                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
841                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
842                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
843
844                 // get next lines
845                 next_src_idx = curr_src_idx^1;
846                 next_dst_idx = curr_dst_idx^1;
847
848                 // 4 lines y
849                 mfc_get( y_plane[next_src_idx],
850                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
851                                 src_dbl_linestride_y,
852                                 RETR_BUF+next_src_idx,
853                                 0, 0 );
854                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
855                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
856                                 src_dbl_linestride_y,
857                                 RETR_BUF+next_src_idx,
858                                 0, 0 );
859
860                 // 2 lines v
861                 mfc_get( v_plane[next_src_idx],
862                                 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
863                                 src_dbl_linestride_vu,
864                                 RETR_BUF+next_src_idx,
865                                 0, 0 );
866                 // 2 lines u
867                 mfc_get( u_plane[next_src_idx],
868                                 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
869                                 src_dbl_linestride_vu,
870                                 RETR_BUF+next_src_idx,
871                                 0, 0 );
872
873                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
874
875                 // scaling
876                 // work line y_upper
877                 bilinear_scale_line_w16( y_plane[curr_src_idx],
878                                 scaled_y_plane[curr_src_idx],
879                                 dst_width,
880                                 vf_x_scale,
881                                 vf_curr_NSweight_y_upper,
882                                 src_linestride_y );
883                 // work line y_lower
884                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
885                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
886                                 dst_width,
887                                 vf_x_scale,
888                                 vf_curr_NSweight_y_lower,
889                                 src_linestride_y );
890                 // work line v
891                 bilinear_scale_line_w16( v_plane[curr_src_idx],
892                                 scaled_v_plane[curr_src_idx],
893                                 dst_width>>1,
894                                 vf_x_scale,
895                                 vf_curr_NSweight_vu,
896                                 src_linestride_vu );
897                 // work line u
898                 bilinear_scale_line_w16( u_plane[curr_src_idx],
899                                 scaled_u_plane[curr_src_idx],
900                                 dst_width>>1,
901                                 vf_x_scale,
902                                 vf_curr_NSweight_vu,
903                                 src_linestride_vu );
904
905                 //---------------------------------------------------------------------------------------------
906                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
907
908                 // Perform three DMA transfers to 3 different locations in the main memory!
909                 // dst_width:   Pixel width of destination image
910                 // dst_addr:    Destination address in main memory
911                 // dst_vu:      Counter which is incremented one by one
912                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
913
914                 mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
915                                 (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
916                                 dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
917                                 STR_BUF+curr_dst_idx,                                                           // Tag
918                                 0, 0 );
919
920                 mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
921                                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
922                                 dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
923                                 STR_BUF+curr_dst_idx,                                                           // Tag
924                                 0, 0 );
925
926                 mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
927                                 (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
928                                 dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
929                                 STR_BUF+curr_dst_idx,                                                           // Tag
930                                 0, 0 );
931                 //---------------------------------------------------------------------------------------------
932
933
934                 // update for next cycle
935                 curr_src_idx = next_src_idx;
936                 curr_dst_idx = next_dst_idx;
937
938                 curr_interpl_y_upper = next_interpl_y_upper;
939                 curr_interpl_y_lower = next_interpl_y_lower;
940                 curr_interpl_vu = next_interpl_vu;
941
942                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
943                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
944                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
945
946                 curr_src_y_upper = next_src_y_upper;
947                 curr_src_y_lower = next_src_y_lower;
948                 curr_src_vu = next_src_vu;
949         }
950
951
952
953         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
954
955         // scaling
956         // work line y_upper
957         bilinear_scale_line_w16( y_plane[curr_src_idx],
958                         scaled_y_plane[curr_src_idx],
959                         dst_width,
960                         vf_x_scale,
961                         vf_curr_NSweight_y_upper,
962                         src_linestride_y );
963         // work line y_lower
964         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
965                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
966                         dst_width,
967                         vf_x_scale,
968                         vf_curr_NSweight_y_lower,
969                         src_linestride_y );
970         // work line v
971         bilinear_scale_line_w16( v_plane[curr_src_idx],
972                         scaled_v_plane[curr_src_idx],
973                         dst_width>>1,
974                         vf_x_scale,
975                         vf_curr_NSweight_vu,
976                         src_linestride_vu );
977         // work line u
978         bilinear_scale_line_w16( u_plane[curr_src_idx],
979                         scaled_u_plane[curr_src_idx],
980                         dst_width>>1,
981                         vf_x_scale,
982                         vf_curr_NSweight_vu,
983                         src_linestride_vu );
984
985
986         //---------------------------------------------------------------------------------------------
987         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
988
989         // Perform three DMA transfers to 3 different locations in the main memory!
990         // dst_width:   Pixel width of destination image
991         // dst_addr:    Destination address in main memory
992         // dst_vu:      Counter which is incremented one by one
993         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
994
995         mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
996                         (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
997                         dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
998                         STR_BUF+curr_dst_idx,                                                           // Tag
999                         0, 0 );
1000
1001         mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
1002                         (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1003                         dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
1004                         STR_BUF+curr_dst_idx,                                                           // Tag
1005                         0, 0 );
1006
1007         mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
1008                         (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1009                         dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
1010                         STR_BUF+curr_dst_idx,                                                           // Tag
1011                         0, 0 );
1012
1013         // wait for completion
1014         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1015         //---------------------------------------------------------------------------------------------
1016 }
1017
1018
1019 /**
1020  * scale_srcw32_dstw32()
1021  *
1022  * processes an input image of width 32
1023  * scaling is done to a width 32
1024  * yuv2rgb conversion on a width of 32
1025  * result stored in RAM
1026  */
1027 void scale_srcw32_dstw32() {
1028         // extract parameters
1029         unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
1030
1031         unsigned int src_width = parms.src_pixel_width;
1032         unsigned int src_height = parms.src_pixel_height;
1033         unsigned int dst_width = parms.dst_pixel_width;
1034         unsigned int dst_height = parms.dst_pixel_height;
1035
1036         // YVU
1037         unsigned int src_linestride_y = src_width;
1038         unsigned int src_dbl_linestride_y = src_width<<1;
1039         unsigned int src_linestride_vu = src_width>>1;
1040         unsigned int src_dbl_linestride_vu = src_width;
1041
1042         // scaled YVU
1043         unsigned int scaled_src_linestride_y = dst_width;
1044
1045         // ram addresses
1046         unsigned char* src_addr_y = parms.y_plane;
1047         unsigned char* src_addr_v = parms.v_plane;
1048         unsigned char* src_addr_u = parms.u_plane;
1049
1050         unsigned int dst_picture_size = dst_width*dst_height;
1051
1052         // Sizes for destination
1053         unsigned int dst_dbl_linestride_y = dst_width<<1;
1054         unsigned int dst_dbl_linestride_vu = dst_width>>1;
1055
1056         // Perform address calculation for Y, V and U in main memory with dst_addr as base
1057         unsigned char* dst_addr_main_memory_y = dst_addr;
1058         unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
1059         unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
1060
1061         // calculate scale factors
1062         vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
1063         float y_scale = (float)src_height/(float)dst_height;
1064
1065         // double buffered processing
1066         // buffer switching
1067         unsigned int curr_src_idx = 0;
1068         unsigned int curr_dst_idx = 0;
1069         unsigned int next_src_idx, next_dst_idx;
1070
1071         // 2 lines y as output, upper and lowerline
1072         unsigned int curr_interpl_y_upper = 0;
1073         unsigned int next_interpl_y_upper;
1074         unsigned int curr_interpl_y_lower, next_interpl_y_lower;
1075         // only 1 line v/u output, both planes have the same dimension
1076         unsigned int curr_interpl_vu = 0;
1077         unsigned int next_interpl_vu;
1078
1079         // weights, calculated in every loop iteration
1080         vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
1081         vector float vf_next_NSweight_y_upper;
1082         vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
1083         vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
1084         vector float vf_next_NSweight_vu;
1085
1086         // line indices for the src picture
1087         float curr_src_y_upper = 0.0f, next_src_y_upper;
1088         float curr_src_y_lower, next_src_y_lower;
1089         float curr_src_vu = 0.0f, next_src_vu;
1090
1091         // line indices for the dst picture
1092         unsigned int dst_y=0, dst_vu=0;
1093
1094         // calculate lower line idices
1095         curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
1096         curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
1097         // lower line weight
1098         vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
1099
1100
1101         // start partially double buffered processing
1102         // get initial data, 2 sets of y, 1 set v, 1 set u
1103         mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
1104         mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
1105                         (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
1106                         src_dbl_linestride_y,
1107                         RETR_BUF,
1108                         0, 0 );
1109         mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1110         mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1111
1112         // iteration loop
1113         // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
1114         // the scaled output is 2 lines y, 1 line v, 1 line u
1115         // the yuv2rgb-converted output is stored to RAM
1116         for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
1117                 dst_y = dst_vu<<1;
1118
1119                 // calculate next indices
1120                 next_src_vu = ((float)dst_vu+1)*y_scale;
1121                 next_src_y_upper = ((float)dst_y+2)*y_scale;
1122                 next_src_y_lower = ((float)dst_y+3)*y_scale;
1123
1124                 next_interpl_vu = (unsigned int) next_src_vu;
1125                 next_interpl_y_upper = (unsigned int) next_src_y_upper;
1126                 next_interpl_y_lower = (unsigned int) next_src_y_lower;
1127
1128                 // calculate weight NORTH-SOUTH
1129                 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
1130                 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
1131                 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
1132
1133                 // get next lines
1134                 next_src_idx = curr_src_idx^1;
1135                 next_dst_idx = curr_dst_idx^1;
1136
1137                 // 4 lines y
1138                 mfc_get( y_plane[next_src_idx],
1139                                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
1140                                 src_dbl_linestride_y,
1141                                 RETR_BUF+next_src_idx,
1142                                 0, 0 );
1143                 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
1144                                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
1145                                 src_dbl_linestride_y,
1146                                 RETR_BUF+next_src_idx,
1147                                 0, 0 );
1148
1149                 // 2 lines v
1150                 mfc_get( v_plane[next_src_idx],
1151                                 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
1152                                 src_dbl_linestride_vu,
1153                                 RETR_BUF+next_src_idx,
1154                                 0, 0 );
1155                 // 2 lines u
1156                 mfc_get( u_plane[next_src_idx],
1157                                 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
1158                                 src_dbl_linestride_vu,
1159                                 RETR_BUF+next_src_idx,
1160                                 0, 0 );
1161
1162                 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1163
1164                 // scaling
1165                 // work line y_upper
1166                 bilinear_scale_line_w16( y_plane[curr_src_idx],
1167                                 scaled_y_plane[curr_src_idx],
1168                                 dst_width,
1169                                 vf_x_scale,
1170                                 vf_curr_NSweight_y_upper,
1171                                 src_linestride_y );
1172                 // work line y_lower
1173                 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1174                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1175                                 dst_width,
1176                                 vf_x_scale,
1177                                 vf_curr_NSweight_y_lower,
1178                                 src_linestride_y );
1179                 // work line v
1180                 bilinear_scale_line_w16( v_plane[curr_src_idx],
1181                                 scaled_v_plane[curr_src_idx],
1182                                 dst_width>>1,
1183                                 vf_x_scale,
1184                                 vf_curr_NSweight_vu,
1185                                 src_linestride_vu );
1186                 // work line u
1187                 bilinear_scale_line_w16( u_plane[curr_src_idx],
1188                                 scaled_u_plane[curr_src_idx],
1189                                 dst_width>>1,
1190                                 vf_x_scale,
1191                                 vf_curr_NSweight_vu,
1192                                 src_linestride_vu );
1193
1194
1195
1196                 // Store the result back to main memory into a destination buffer in YUV format
1197                 //---------------------------------------------------------------------------------------------
1198                 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1199
1200                 // Perform three DMA transfers to 3 different locations in the main memory!
1201                 // dst_width:   Pixel width of destination image
1202                 // dst_addr:    Destination address in main memory
1203                 // dst_vu:      Counter which is incremented one by one
1204                 // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1205
1206                 mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
1207                                 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),  // Destination in main memory (addr)
1208                                 dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
1209                                 STR_BUF+curr_dst_idx,                                                           // Tag
1210                                 0, 0 );
1211
1212                 mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
1213                                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1214                                 dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
1215                                 STR_BUF+curr_dst_idx,                                                           // Tag
1216                                 0, 0 );
1217
1218                 mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
1219                                 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1220                                 dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
1221                                 STR_BUF+curr_dst_idx,                                                           // Tag
1222                                 0, 0 );
1223                 //---------------------------------------------------------------------------------------------
1224
1225
1226                 // update for next cycle
1227                 curr_src_idx = next_src_idx;
1228                 curr_dst_idx = next_dst_idx;
1229
1230                 curr_interpl_y_upper = next_interpl_y_upper;
1231                 curr_interpl_y_lower = next_interpl_y_lower;
1232                 curr_interpl_vu = next_interpl_vu;
1233
1234                 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
1235                 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
1236                 vf_curr_NSweight_vu = vf_next_NSweight_vu;
1237
1238                 curr_src_y_upper = next_src_y_upper;
1239                 curr_src_y_lower = next_src_y_lower;
1240                 curr_src_vu = next_src_vu;
1241         }
1242
1243
1244
1245         DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1246
1247         // scaling
1248         // work line y_upper
1249         bilinear_scale_line_w16( y_plane[curr_src_idx],
1250                         scaled_y_plane[curr_src_idx],
1251                         dst_width,
1252                         vf_x_scale,
1253                         vf_curr_NSweight_y_upper,
1254                         src_linestride_y );
1255         // work line y_lower
1256         bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1257                         scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1258                         dst_width,
1259                         vf_x_scale,
1260                         vf_curr_NSweight_y_lower,
1261                         src_linestride_y );
1262         // work line v
1263         bilinear_scale_line_w16( v_plane[curr_src_idx],
1264                         scaled_v_plane[curr_src_idx],
1265                         dst_width>>1,
1266                         vf_x_scale,
1267                         vf_curr_NSweight_vu,
1268                         src_linestride_vu );
1269         // work line u
1270         bilinear_scale_line_w16( u_plane[curr_src_idx],
1271                         scaled_u_plane[curr_src_idx],
1272                         dst_width>>1,
1273                         vf_x_scale,
1274                         vf_curr_NSweight_vu,
1275                         src_linestride_vu );
1276
1277
1278         // Store the result back to main memory into a destination buffer in YUV format
1279         //---------------------------------------------------------------------------------------------
1280         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1281
1282         // Perform three DMA transfers to 3 different locations in the main memory!
1283         // dst_width:   Pixel width of destination image
1284         // dst_addr:    Destination address in main memory
1285         // dst_vu:      Counter which is incremented one by one
1286         // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1287
1288         mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
1289                         (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
1290                         dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
1291                         STR_BUF+curr_dst_idx,                                                           // Tag
1292                         0, 0 );
1293
1294         mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
1295                         (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1296                         dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
1297                         STR_BUF+curr_dst_idx,                                                           // Tag
1298                         0, 0 );
1299
1300         mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
1301                         (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1302                         dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
1303                         STR_BUF+curr_dst_idx,                                                           // Tag
1304                         0, 0 );
1305
1306         // wait for completion
1307         DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1308         //---------------------------------------------------------------------------------------------
1309 }
1310
1311
1312 /*
1313  * bilinear_scale_line_w8()
1314  *
1315  * processes a line of yuv-input, width has to be a multiple of 8
1316  * scaled yuv-output is written to local store buffer
1317  *
1318  * @param src buffer for 2 lines input
1319  * @param dst_ buffer for 1 line output
1320  * @param dst_width the width of the destination line
1321  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1322  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1323  * @param src_linestride the stride of the srcline
1324  */
1325 void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1326
1327         unsigned char* dst = dst_;
1328
1329         unsigned int dst_x;
1330         for( dst_x=0; dst_x<dst_width; dst_x+=8) {
1331                 // address calculation for loading the 4 surrounding pixel of each calculated
1332                 // destination pixel
1333                 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1334                 // lower range->first 4 pixel
1335                 // upper range->next 4 pixel
1336                 vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
1337                 vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
1338                 vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
1339                 vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
1340
1341                 // calculate weight EAST-WEST
1342                 vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
1343                 vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
1344                 vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
1345                 vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
1346                 vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
1347                 vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
1348                 vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
1349                 vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
1350                 vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
1351                 vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
1352
1353                 // calculate address offset
1354                 //
1355                 // pixel NORTH WEST
1356                 vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
1357                 vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
1358
1359                 // pixel NORTH EAST-->(offpixelNW+1)
1360                 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1361                 vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
1362                 vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
1363
1364                 // SOUTH-WEST-->(offpixelNW+src_linestride)
1365                 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1366                 vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
1367                 vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
1368
1369                 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1370                 vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
1371                 vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
1372
1373                 // calculate each address
1374                 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1375                 vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
1376                 vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
1377                 vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
1378                 vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
1379
1380                 vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
1381                 vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
1382                 vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
1383                 vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
1384
1385                 // get each pixel
1386                 //
1387                 // scalar load, afterwards insertion into the right position
1388                 // NORTH WEST
1389                 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1390                 vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
1391                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
1392                 vuc_pixel_NW_lower_range = spu_insert(
1393                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
1394                                 vuc_pixel_NW_lower_range, 7 );
1395                 vuc_pixel_NW_lower_range = spu_insert(
1396                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
1397                                 vuc_pixel_NW_lower_range, 11 );
1398                 vuc_pixel_NW_lower_range = spu_insert(
1399                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
1400                                 vuc_pixel_NW_lower_range, 15 );
1401
1402                 vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
1403                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
1404                 vuc_pixel_NW_upper_range = spu_insert(
1405                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
1406                                 vuc_pixel_NW_upper_range, 7 );
1407                 vuc_pixel_NW_upper_range = spu_insert(
1408                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
1409                                 vuc_pixel_NW_upper_range, 11 );
1410                 vuc_pixel_NW_upper_range = spu_insert(
1411                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
1412                                 vuc_pixel_NW_upper_range, 15 );
1413
1414                 // NORTH EAST
1415                 vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
1416                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
1417                 vuc_pixel_NE_lower_range = spu_insert(
1418                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
1419                                 vuc_pixel_NE_lower_range, 7 );
1420                 vuc_pixel_NE_lower_range = spu_insert(
1421                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
1422                                 vuc_pixel_NE_lower_range, 11 );
1423                 vuc_pixel_NE_lower_range = spu_insert(
1424                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
1425                                 vuc_pixel_NE_lower_range, 15 );
1426
1427                 vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
1428                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
1429                 vuc_pixel_NE_upper_range = spu_insert(
1430                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
1431                                 vuc_pixel_NE_upper_range, 7 );
1432                 vuc_pixel_NE_upper_range = spu_insert(
1433                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
1434                                 vuc_pixel_NE_upper_range, 11 );
1435                 vuc_pixel_NE_upper_range = spu_insert(
1436                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
1437                                 vuc_pixel_NE_upper_range, 15 );
1438
1439
1440                 // SOUTH WEST
1441                 vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
1442                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
1443                 vuc_pixel_SW_lower_range = spu_insert(
1444                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
1445                                 vuc_pixel_SW_lower_range, 7 );
1446                 vuc_pixel_SW_lower_range = spu_insert(
1447                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
1448                                 vuc_pixel_SW_lower_range, 11 );
1449                 vuc_pixel_SW_lower_range = spu_insert(
1450                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
1451                                 vuc_pixel_SW_lower_range, 15 );
1452
1453                 vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
1454                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
1455                 vuc_pixel_SW_upper_range = spu_insert(
1456                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
1457                                 vuc_pixel_SW_upper_range, 7 );
1458                 vuc_pixel_SW_upper_range = spu_insert(
1459                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
1460                                 vuc_pixel_SW_upper_range, 11 );
1461                 vuc_pixel_SW_upper_range = spu_insert(
1462                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
1463                                 vuc_pixel_SW_upper_range, 15 );
1464
1465                 // SOUTH EAST
1466                 vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
1467                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
1468                 vuc_pixel_SE_lower_range = spu_insert(
1469                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
1470                                 vuc_pixel_SE_lower_range, 7 );
1471                 vuc_pixel_SE_lower_range = spu_insert(
1472                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
1473                                 vuc_pixel_SE_lower_range, 11 );
1474                 vuc_pixel_SE_lower_range = spu_insert(
1475                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
1476                                 vuc_pixel_SE_lower_range, 15 );
1477
1478                 vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
1479                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
1480                 vuc_pixel_SE_upper_range = spu_insert(
1481                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
1482                                 vuc_pixel_SE_upper_range, 7 );
1483                 vuc_pixel_SE_upper_range = spu_insert(
1484                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
1485                                 vuc_pixel_SE_upper_range, 11 );
1486                 vuc_pixel_SE_upper_range = spu_insert(
1487                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
1488                                 vuc_pixel_SE_upper_range, 15 );
1489
1490
1491                 // convert to float
1492                 vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
1493                 vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
1494
1495                 vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
1496                 vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
1497
1498                 vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
1499                 vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
1500
1501                 vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
1502                 vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
1503
1504
1505
1506                 // first linear interpolation: EWtop
1507                 // EWtop = NW + EWweight*(NE-NW)
1508                 //
1509                 // lower range
1510                 vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
1511                 vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
1512                                                                 vf_EWtop_lower_range_tmp,
1513                                                                 vf_pixel_NW_lower_range );
1514
1515                 // upper range
1516                 vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
1517                 vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
1518                                                                 vf_EWtop_upper_range_tmp,
1519                                                                 vf_pixel_NW_upper_range );
1520
1521
1522
1523                 // second linear interpolation: EWbottom
1524                 // EWbottom = SW + EWweight*(SE-SW)
1525                 //
1526                 // lower range
1527                 vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
1528                 vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
1529                                                                 vf_EWbottom_lower_range_tmp,
1530                                                                 vf_pixel_SW_lower_range );
1531
1532                 // upper range
1533                 vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
1534                 vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
1535                                                                 vf_EWbottom_upper_range_tmp,
1536                                                                 vf_pixel_SW_upper_range );
1537
1538
1539
1540                 // third linear interpolation: the bilinear interpolated value
1541                 // result = EWtop + NSweight*(EWbottom-EWtop);
1542                 //
1543                 // lower range
1544                 vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
1545                 vector float vf_result_lower_range = spu_madd( vf_NSweight,
1546                                                                 vf_result_lower_range_tmp,
1547                                                                 vf_EWtop_lower_range );
1548
1549                 // upper range
1550                 vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
1551                 vector float vf_result_upper_range = spu_madd( vf_NSweight,
1552                                                                 vf_result_upper_range_tmp,
1553                                                                 vf_EWtop_upper_range );
1554
1555
1556                 // convert back: using saturated arithmetic
1557                 vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
1558                 vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
1559
1560                 // merge results->lower,upper
1561                 vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
1562                                                                0x13, 0x17, 0x1B, 0x1F,
1563                                                                0x00, 0x00, 0x00, 0x00,
1564                                                                0x00, 0x00, 0x00, 0x00 };
1565
1566                 vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
1567                                                                 (vector unsigned char) vui_result_upper_range,
1568                                                                 vuc_mask_merge_result );
1569
1570                 // partial storing
1571                 vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
1572                                                       0x00, 0x00, 0x00, 0x00,
1573                                                       0xFF, 0xFF, 0xFF, 0xFF,
1574                                                       0xFF, 0xFF, 0xFF, 0xFF };
1575
1576
1577                 // get currently stored data
1578                 vector unsigned char vuc_orig = *((vector unsigned char*)dst);
1579
1580                 // clear currently stored data
1581                 vuc_orig = spu_and( vuc_orig,
1582                                 spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
1583
1584                 // rotate result according to storing address
1585                 vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
1586
1587                 // store result
1588                 *((vector unsigned char*)dst) = spu_or( vuc_result,
1589                                                         vuc_orig );
1590                 dst += 8;
1591         }
1592 }
1593
1594
1595 /*
1596  * bilinear_scale_line_w16()
1597  *
1598  * processes a line of yuv-input, width has to be a multiple of 16
1599  * scaled yuv-output is written to local store buffer
1600  *
1601  * @param src buffer for 2 lines input
1602  * @param dst_ buffer for 1 line output
1603  * @param dst_width the width of the destination line
1604  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1605  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1606  * @param src_linestride the stride of the srcline
1607  */
1608 void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1609
1610         unsigned char* dst = dst_;
1611
1612         unsigned int dst_x;
1613         for( dst_x=0; dst_x<dst_width; dst_x+=16) {
1614                 // address calculation for loading the 4 surrounding pixel of each calculated
1615                 // destination pixel
1616                 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1617                 // parallelised processing
1618                 // first range->pixel 1 2 3 4
1619                 // second range->pixel 5 6 7 8
1620                 // third range->pixel 9 10 11 12
1621                 // fourth range->pixel 13 14 15 16
1622                 vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
1623                 vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
1624                 vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
1625                 vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
1626                 vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
1627                 vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
1628                 vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
1629                 vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
1630
1631                 // calculate weight EAST-WEST
1632                 vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
1633                 vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
1634                 vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
1635                 vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
1636                 vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
1637                 vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
1638                 vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
1639                 vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
1640                 vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
1641                 vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
1642                 vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
1643                 vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
1644                 vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
1645                 vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
1646                 vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
1647                 vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
1648                 vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
1649                 vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
1650                 vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
1651                 vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
1652
1653                 // calculate address offset
1654                 //
1655                 // pixel NORTH WEST
1656                 vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
1657                 vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
1658                 vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
1659                 vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
1660
1661                 // pixel NORTH EAST-->(offpixelNW+1)
1662                 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1663                 vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
1664                 vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
1665                 vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
1666                 vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
1667
1668                 // SOUTH-WEST-->(offpixelNW+src_linestride)
1669                 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1670                 vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
1671                 vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
1672                 vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
1673                 vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
1674
1675                 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1676                 vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
1677                 vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
1678                 vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
1679                 vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
1680
1681                 // calculate each address
1682                 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1683                 vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
1684                 vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
1685                 vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
1686                 vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
1687
1688                 vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
1689                 vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
1690                 vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
1691                 vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
1692
1693                 vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
1694                 vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
1695                 vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
1696                 vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
1697
1698                 vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
1699                 vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
1700                 vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
1701                 vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
1702
1703
1704                 // get each pixel
1705                 //
1706                 // scalar load, afterwards insertion into the right position
1707                 // NORTH WEST
1708                 // first range
1709                 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1710                 vector unsigned char vuc_pixel_NW_first_range = spu_insert(
1711                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
1712                 vuc_pixel_NW_first_range = spu_insert(
1713                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
1714                                 vuc_pixel_NW_first_range, 7 );
1715                 vuc_pixel_NW_first_range = spu_insert(
1716                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
1717                                 vuc_pixel_NW_first_range, 11 );
1718                 vuc_pixel_NW_first_range = spu_insert(
1719                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
1720                                 vuc_pixel_NW_first_range, 15 );
1721                 // second range
1722                 vector unsigned char vuc_pixel_NW_second_range = spu_insert(
1723                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
1724                 vuc_pixel_NW_second_range = spu_insert(
1725                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
1726                                 vuc_pixel_NW_second_range, 7 );
1727                 vuc_pixel_NW_second_range = spu_insert(
1728                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
1729                                 vuc_pixel_NW_second_range, 11 );
1730                 vuc_pixel_NW_second_range = spu_insert(
1731                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
1732                                 vuc_pixel_NW_second_range, 15 );
1733                 // third range
1734                 vector unsigned char vuc_pixel_NW_third_range = spu_insert(
1735                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
1736                 vuc_pixel_NW_third_range = spu_insert(
1737                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
1738                                 vuc_pixel_NW_third_range, 7 );
1739                 vuc_pixel_NW_third_range = spu_insert(
1740                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
1741                                 vuc_pixel_NW_third_range, 11 );
1742                 vuc_pixel_NW_third_range = spu_insert(
1743                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
1744                                 vuc_pixel_NW_third_range, 15 );
1745                 // fourth range
1746                 vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
1747                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
1748                 vuc_pixel_NW_fourth_range = spu_insert(
1749                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
1750                                 vuc_pixel_NW_fourth_range, 7 );
1751                 vuc_pixel_NW_fourth_range = spu_insert(
1752                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
1753                                 vuc_pixel_NW_fourth_range, 11 );
1754                 vuc_pixel_NW_fourth_range = spu_insert(
1755                                 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
1756                                 vuc_pixel_NW_fourth_range, 15 );
1757
1758                 // NORTH EAST
1759                 // first range
1760                 vector unsigned char vuc_pixel_NE_first_range = spu_insert(
1761                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
1762                 vuc_pixel_NE_first_range = spu_insert(
1763                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
1764                                 vuc_pixel_NE_first_range, 7 );
1765                 vuc_pixel_NE_first_range = spu_insert(
1766                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
1767                                 vuc_pixel_NE_first_range, 11 );
1768                 vuc_pixel_NE_first_range = spu_insert(
1769                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
1770                                 vuc_pixel_NE_first_range, 15 );
1771                 // second range
1772                 vector unsigned char vuc_pixel_NE_second_range = spu_insert(
1773                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
1774                 vuc_pixel_NE_second_range = spu_insert(
1775                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
1776                                 vuc_pixel_NE_second_range, 7 );
1777                 vuc_pixel_NE_second_range = spu_insert(
1778                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
1779                                 vuc_pixel_NE_second_range, 11 );
1780                 vuc_pixel_NE_second_range = spu_insert(
1781                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
1782                                 vuc_pixel_NE_second_range, 15 );
1783                 // third range
1784                 vector unsigned char vuc_pixel_NE_third_range = spu_insert(
1785                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
1786                 vuc_pixel_NE_third_range = spu_insert(
1787                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
1788                                 vuc_pixel_NE_third_range, 7 );
1789                 vuc_pixel_NE_third_range = spu_insert(
1790                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
1791                                 vuc_pixel_NE_third_range, 11 );
1792                 vuc_pixel_NE_third_range = spu_insert(
1793                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
1794                                 vuc_pixel_NE_third_range, 15 );
1795                 // fourth range
1796                 vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
1797                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
1798                 vuc_pixel_NE_fourth_range = spu_insert(
1799                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
1800                                 vuc_pixel_NE_fourth_range, 7 );
1801                 vuc_pixel_NE_fourth_range = spu_insert(
1802                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
1803                                 vuc_pixel_NE_fourth_range, 11 );
1804                 vuc_pixel_NE_fourth_range = spu_insert(
1805                                 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
1806                                 vuc_pixel_NE_fourth_range, 15 );
1807
1808                 // SOUTH WEST
1809                 // first range
1810                 vector unsigned char vuc_pixel_SW_first_range = spu_insert(
1811                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
1812                 vuc_pixel_SW_first_range = spu_insert(
1813                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
1814                                 vuc_pixel_SW_first_range, 7 );
1815                 vuc_pixel_SW_first_range = spu_insert(
1816                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
1817                                 vuc_pixel_SW_first_range, 11 );
1818                 vuc_pixel_SW_first_range = spu_insert(
1819                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
1820                                 vuc_pixel_SW_first_range, 15 );
1821                 // second range
1822                 vector unsigned char vuc_pixel_SW_second_range = spu_insert(
1823                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
1824                 vuc_pixel_SW_second_range = spu_insert(
1825                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
1826                                 vuc_pixel_SW_second_range, 7 );
1827                 vuc_pixel_SW_second_range = spu_insert(
1828                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
1829                                 vuc_pixel_SW_second_range, 11 );
1830                 vuc_pixel_SW_second_range = spu_insert(
1831                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
1832                                 vuc_pixel_SW_second_range, 15 );
1833                 // third range
1834                 vector unsigned char vuc_pixel_SW_third_range = spu_insert(
1835                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
1836                 vuc_pixel_SW_third_range = spu_insert(
1837                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
1838                                 vuc_pixel_SW_third_range, 7 );
1839                 vuc_pixel_SW_third_range = spu_insert(
1840                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
1841                                 vuc_pixel_SW_third_range, 11 );
1842                 vuc_pixel_SW_third_range = spu_insert(
1843                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
1844                                 vuc_pixel_SW_third_range, 15 );
1845                 // fourth range
1846                 vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
1847                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
1848                 vuc_pixel_SW_fourth_range = spu_insert(
1849                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
1850                                 vuc_pixel_SW_fourth_range, 7 );
1851                 vuc_pixel_SW_fourth_range = spu_insert(
1852                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
1853                                 vuc_pixel_SW_fourth_range, 11 );
1854                 vuc_pixel_SW_fourth_range = spu_insert(
1855                                 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
1856                                 vuc_pixel_SW_fourth_range, 15 );
1857
1858                 // SOUTH EAST
1859                 // first range
1860                 vector unsigned char vuc_pixel_SE_first_range = spu_insert(
1861                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
1862                 vuc_pixel_SE_first_range = spu_insert(
1863                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
1864                                 vuc_pixel_SE_first_range, 7 );
1865                 vuc_pixel_SE_first_range = spu_insert(
1866                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
1867                                 vuc_pixel_SE_first_range, 11 );
1868                 vuc_pixel_SE_first_range = spu_insert(
1869                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
1870                                 vuc_pixel_SE_first_range, 15 );
1871                 // second range
1872                 vector unsigned char vuc_pixel_SE_second_range = spu_insert(
1873                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
1874                 vuc_pixel_SE_second_range = spu_insert(
1875                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
1876                                 vuc_pixel_SE_second_range, 7 );
1877                 vuc_pixel_SE_second_range = spu_insert(
1878                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
1879                                 vuc_pixel_SE_second_range, 11 );
1880                 vuc_pixel_SE_second_range = spu_insert(
1881                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
1882                                 vuc_pixel_SE_second_range, 15 );
1883                 // third range
1884                 vector unsigned char vuc_pixel_SE_third_range = spu_insert(
1885                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
1886                 vuc_pixel_SE_third_range = spu_insert(
1887                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
1888                                 vuc_pixel_SE_third_range, 7 );
1889                 vuc_pixel_SE_third_range = spu_insert(
1890                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
1891                                 vuc_pixel_SE_third_range, 11 );
1892                 vuc_pixel_SE_third_range = spu_insert(
1893                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
1894                                 vuc_pixel_SE_third_range, 15 );
1895                 // fourth range
1896                 vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
1897                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
1898                 vuc_pixel_SE_fourth_range = spu_insert(
1899                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
1900                                 vuc_pixel_SE_fourth_range, 7 );
1901                 vuc_pixel_SE_fourth_range = spu_insert(
1902                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
1903                                 vuc_pixel_SE_fourth_range, 11 );
1904                 vuc_pixel_SE_fourth_range = spu_insert(
1905                                 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
1906                                 vuc_pixel_SE_fourth_range, 15 );
1907
1908
1909
1910                 // convert to float
1911                 vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
1912                 vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
1913                 vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
1914                 vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
1915
1916                 vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
1917                 vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
1918                 vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
1919                 vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
1920
1921                 vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
1922                 vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
1923                 vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
1924                 vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
1925
1926                 vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
1927                 vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
1928                 vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
1929                 vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
1930
1931                 // first linear interpolation: EWtop
1932                 // EWtop = NW + EWweight*(NE-NW)
1933                 //
1934                 // first range
1935                 vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
1936                 vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
1937                                                                 vf_EWtop_first_range_tmp,
1938                                                                 vf_pixel_NW_first_range );
1939
1940                 // second range
1941                 vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
1942                 vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
1943                                                                 vf_EWtop_second_range_tmp,
1944                                                                 vf_pixel_NW_second_range );
1945
1946                 // third range
1947                 vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
1948                 vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
1949                                                                 vf_EWtop_third_range_tmp,
1950                                                                 vf_pixel_NW_third_range );
1951
1952                 // fourth range
1953                 vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
1954                 vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
1955                                                                 vf_EWtop_fourth_range_tmp,
1956                                                                 vf_pixel_NW_fourth_range );
1957
1958
1959
1960                 // second linear interpolation: EWbottom
1961                 // EWbottom = SW + EWweight*(SE-SW)
1962                 //
1963                 // first range
1964                 vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
1965                 vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
1966                                                                 vf_EWbottom_first_range_tmp,
1967                                                                 vf_pixel_SW_first_range );
1968
1969                 // second range
1970                 vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
1971                 vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
1972                                                                 vf_EWbottom_second_range_tmp,
1973                                                                 vf_pixel_SW_second_range );
1974                 // third range
1975                 vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
1976                 vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
1977                                                                 vf_EWbottom_third_range_tmp,
1978                                                                 vf_pixel_SW_third_range );
1979
1980                 // fourth range
1981                 vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
1982                 vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
1983                                                                 vf_EWbottom_fourth_range_tmp,
1984                                                                 vf_pixel_SW_fourth_range );
1985
1986
1987
1988                 // third linear interpolation: the bilinear interpolated value
1989                 // result = EWtop + NSweight*(EWbottom-EWtop);
1990                 //
1991                 // first range
1992                 vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
1993                 vector float vf_result_first_range = spu_madd( vf_NSweight,
1994                                                                 vf_result_first_range_tmp,
1995                                                                 vf_EWtop_first_range );
1996
1997                 // second range
1998                 vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
1999                 vector float vf_result_second_range = spu_madd( vf_NSweight,
2000                                                                 vf_result_second_range_tmp,
2001                                                                 vf_EWtop_second_range );
2002
2003                 // third range
2004                 vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
2005                 vector float vf_result_third_range = spu_madd( vf_NSweight,
2006                                                                 vf_result_third_range_tmp,
2007                                                                 vf_EWtop_third_range );
2008
2009                 // fourth range
2010                 vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
2011                 vector float vf_result_fourth_range = spu_madd( vf_NSweight,
2012                                                                 vf_result_fourth_range_tmp,
2013                                                                 vf_EWtop_fourth_range );
2014
2015
2016
2017                 // convert back: using saturated arithmetic
2018                 vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
2019                 vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
2020                 vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
2021                 vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
2022
2023                 // merge results->lower,upper
2024                 vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
2025                                                                             0x13, 0x17, 0x1B, 0x1F,
2026                                                                             0x00, 0x00, 0x00, 0x00,
2027                                                                             0x00, 0x00, 0x00, 0x00 };
2028
2029                 vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
2030                                                                             0x00, 0x00, 0x00, 0x00,
2031                                                                             0x03, 0x07, 0x0B, 0x0F,
2032                                                                             0x13, 0x17, 0x1B, 0x1F };
2033
2034                 vector unsigned char vuc_result_first_second =
2035                                                 spu_shuffle( (vector unsigned char) vui_result_first_range,
2036                                                                  (vector unsigned char) vui_result_second_range,
2037                                                                 vuc_mask_merge_result_first_second );
2038
2039                 vector unsigned char vuc_result_third_fourth =
2040                                                 spu_shuffle( (vector unsigned char) vui_result_third_range,
2041                                                                  (vector unsigned char) vui_result_fourth_range,
2042                                                                 vuc_mask_merge_result_third_fourth );
2043
2044                 // store result
2045                 *((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
2046                                                         vuc_result_third_fourth );
2047                 dst += 16;
2048         }
2049 }
2050