2 * SDL - Simple DirectMedia Layer
3 * CELL BE Support for PS3 Framebuffer
4 * Copyright (C) 2008, 2009 International Business Machines Corporation
6 * This library is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU Lesser General Public License as published
8 * by the Free Software Foundation; either version 2.1 of the License, or
9 * (at your option) any later version.
11 * This library is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
22 * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
23 * SPE code based on research by:
28 #include "spu_common.h"
30 #include <spu_intrinsics.h>
31 #include <spu_mfcio.h>
// Debug trace macro. The first definition forwards its arguments to
// fprintf(stdout, ...); the second expands to nothing, which removes all
// tracing. NOTE(review): the preprocessor conditionals that choose between
// the two definitions are not visible in this listing.
37 #define deprintf(fmt, args... ) \
38 fprintf( stdout, fmt, ##args ); \
41 #define deprintf( fmt, args... )
// Scaling parameters for this run. main() fills this structure with a DMA
// transfer from the address it receives in argp; the scale_* routines read
// the source/destination sizes and plane addresses from it.
44 struct scale_parms_t parms __attribute__((aligned(128)));
46 /* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
47 * there might be the need to retrieve misaligned data, adjust
48 * incoming v and u plane to be able to handle this (add 128)
50 unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
51 unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
52 unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
54 /* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */
// Output staging buffers, two of each so they can be indexed by the 0/1
// buffer index used in the scale_* routines. Each scaled_y_plane entry holds
// MAX_HDTV_WIDTH*2 bytes and each scaled_v_plane/scaled_u_plane entry holds
// MAX_HDTV_WIDTH/2 bytes.
55 unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
56 unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
57 unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
59 /* some vectors needed by the float to int conversion */
// vec_255 and vec_0_1 are the upper and lower clamp bounds applied by
// vfloat_to_vuint() before it converts to unsigned int.
60 static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
61 static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
// Line scalers, defined outside this listing. Both take a source pointer, a
// destination pointer, the destination width, a horizontal scale vector, a
// vertical (north-south) weight vector and the source line stride.
// NOTE(review): judging by the names, the _w8/_w16 suffix is the output
// width granularity each routine works in -- confirm against the definitions.
63 void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
64 void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
// Whole-image scaling routines. main() picks one depending on whether the
// source and destination widths are multiples of 32. All of them take their
// input from the global `parms`.
66 void scale_srcw16_dstw16();
67 void scale_srcw16_dstw32();
68 void scale_srcw32_dstw16();
69 void scale_srcw32_dstw32();
// SPU program entry point.
//
// Fetches sizeof(struct scale_parms_t) bytes from the main-memory address
// given by argp into the global `parms` (DMA get on TAG_INIT), waits for that
// transfer, and then calls exactly one of the four scale_* routines.
//
// Dispatch: a width with any of its low five bits set (w & 0x1f != 0, i.e.
// not a multiple of 32) selects the "w16" flavour for that side; the debug
// strings show that the "w32" flavour is used otherwise.
//
// spe_id is only used by the debug trace, hence the unused attribute.
//
// NOTE(review): the braces, the else lines and the return statement are
// elided in this listing.
71 int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
73 deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
74 /* DMA transfer for the input parameters */
75 spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
76 DMA_WAIT_TAG(TAG_INIT);
78 deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
79 parms.dst_pixel_width, parms.dst_pixel_height);
// Source width is not a multiple of 32: use the srcw16 routines.
81 if(parms.src_pixel_width & 0x1f) {
82 if(parms.dst_pixel_width & 0x1F) {
83 deprintf("[SPU] Using scale_srcw16_dstw16\n");
84 scale_srcw16_dstw16();
86 deprintf("[SPU] Using scale_srcw16_dstw32\n");
87 scale_srcw16_dstw32();
// Source width is a multiple of 32: use the srcw32 routines.
90 if(parms.dst_pixel_width & 0x1F) {
91 deprintf("[SPU] Using scale_srcw32_dstw16\n");
92 scale_srcw32_dstw16();
94 deprintf("[SPU] Using scale_srcw32_dstw32\n");
95 scale_srcw32_dstw32();
98 deprintf("[SPU] bilin_scaler_spu... done!\n");
107 * converts a float vector to an unsigned int vector using saturated
110 * @param vec_s float vector for conversion
111 * @returns converted unsigned int vector
// Clamps every lane of vec_s to the range [0.1, 255.0] and converts the
// result to unsigned int with spu_convtu (scale argument 0).
113 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
// Lanes below 0.1 are replaced by 0.1 (mask is set where 0.1 > vec_s).
114 vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
115 vec_s = spu_sel(vec_s, vec_0_1, select_1);
// Lanes above 255.0 are replaced by 255.0.
117 vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
118 vec_s = spu_sel(vec_s, vec_255, select_2);
119 return spu_convtu(vec_s,0);
124 * scale_srcw16_dstw16()
126 * processes an input image of width 16
127 * scaling is done to a width 16
128 * result stored in RAM
/*
 * Bilinear-scales the planar YUV image described by the global `parms` and
 * DMAs the scaled Y, V and U planes to parms.dstBuffer (Y first, then V at
 * offset dst_width*dst_height, then U a further quarter picture later).
 * main() selects this routine when neither the source nor the destination
 * width is a multiple of 32.
 *
 * Source lines are fetched into y_plane/v_plane/u_plane[idx], where idx
 * toggles between 0 and 1 every iteration: the fetch for the next output
 * line pair is issued before the current pair is scaled. Each iteration
 * emits two Y lines plus one V and one U line; the last pair is processed
 * after the loop.
 *
 * V and U source addresses are rounded down to a 16-byte boundary for the
 * fetch; the discarded low bits (lsoff) are added back as a local-store
 * offset when the fetched line is scaled.
 *
 * NOTE(review): this listing elides some source lines (braces, trailing call
 * arguments); only the visible lines are reproduced here.
 */
130 void scale_srcw16_dstw16() {
131 // extract parameters
132 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
134 unsigned int src_width = parms.src_pixel_width;
135 unsigned int src_height = parms.src_pixel_height;
136 unsigned int dst_width = parms.dst_pixel_width;
137 unsigned int dst_height = parms.dst_pixel_height;
140 unsigned int src_linestride_y = src_width;
141 unsigned int src_dbl_linestride_y = src_width<<1;
142 unsigned int src_linestride_vu = src_width>>1;
143 unsigned int src_dbl_linestride_vu = src_width;
146 unsigned int scaled_src_linestride_y = dst_width;
149 unsigned char* src_addr_y = parms.y_plane;
150 unsigned char* src_addr_v = parms.v_plane;
151 unsigned char* src_addr_u = parms.u_plane;
153 // for handling misalignment, addresses are precalculated
154 unsigned char* precalc_src_addr_v = src_addr_v;
155 unsigned char* precalc_src_addr_u = src_addr_u;
157 unsigned int dst_picture_size = dst_width*dst_height;
159 // Sizes for destination
160 unsigned int dst_dbl_linestride_y = dst_width<<1;
161 unsigned int dst_dbl_linestride_vu = dst_width>>1;
163 // Perform address calculation for Y, V and U in main memory with dst_addr as base
164 unsigned char* dst_addr_main_memory_y = dst_addr;
165 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
166 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
168 // calculate scale factors
169 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
170 float y_scale = (float)src_height/(float)dst_height;
172 // double buffered processing
174 unsigned int curr_src_idx = 0;
175 unsigned int curr_dst_idx = 0;
176 unsigned int next_src_idx, next_dst_idx;
178 // 2 lines y as output, upper and lowerline
179 unsigned int curr_interpl_y_upper = 0;
180 unsigned int next_interpl_y_upper;
181 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
182 // only 1 line v/u output, both planes have the same dimension
183 unsigned int curr_interpl_vu = 0;
184 unsigned int next_interpl_vu;
186 // weights, calculated in every loop iteration
187 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
188 vector float vf_next_NSweight_y_upper;
189 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
190 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
191 vector float vf_next_NSweight_vu;
193 // line indices for the src picture
194 float curr_src_y_upper = 0.0f, next_src_y_upper;
195 float curr_src_y_lower, next_src_y_lower;
196 float curr_src_vu = 0.0f, next_src_vu;
198 // line indices for the dst picture
199 unsigned int dst_y=0, dst_vu=0;
201 // offset for the v and u plane to handle misalignment
202 unsigned int curr_lsoff_v = 0, next_lsoff_v;
203 unsigned int curr_lsoff_u = 0, next_lsoff_u;
205 // calculate lower line indices
206 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
207 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
209 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
212 // start partially double buffered processing
213 // get initial data, 2 sets of y, 1 set v, 1 set u
214 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
215 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
216 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
217 src_dbl_linestride_y,
220 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
221 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
224 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
225 * the scaled output is 2 lines y, 1 line v, 1 line u
226 * the scaled YUV output is stored to RAM
228 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
231 // calculate next indices
232 next_src_vu = ((float)dst_vu+1)*y_scale;
233 next_src_y_upper = ((float)dst_y+2)*y_scale;
234 next_src_y_lower = ((float)dst_y+3)*y_scale;
236 next_interpl_vu = (unsigned int) next_src_vu;
237 next_interpl_y_upper = (unsigned int) next_src_y_upper;
238 next_interpl_y_lower = (unsigned int) next_src_y_lower;
240 // calculate weight NORTH-SOUTH
241 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
242 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
243 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
246 next_src_idx = curr_src_idx^1;
247 next_dst_idx = curr_dst_idx^1;
250 mfc_get( y_plane[next_src_idx],
251 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
252 src_dbl_linestride_y,
253 RETR_BUF+next_src_idx,
255 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
256 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
257 src_dbl_linestride_y,
258 RETR_BUF+next_src_idx,
// V fetch: align the address down to 16 bytes and enlarge the transfer by
// twice the dropped offset so both fetched lines still cover the data.
262 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
263 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
264 mfc_get( v_plane[next_src_idx],
265 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
266 src_dbl_linestride_vu+(next_lsoff_v<<1),
267 RETR_BUF+next_src_idx,
// U fetch: same scheme, but padded with the U plane's own offset (the
// previous code reused next_lsoff_v here).
270 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
271 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
272 mfc_get( u_plane[next_src_idx],
273 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
274 src_dbl_linestride_vu+(next_lsoff_u<<1),
275 RETR_BUF+next_src_idx,
278 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
282 bilinear_scale_line_w16( y_plane[curr_src_idx],
283 scaled_y_plane[curr_src_idx],
286 vf_curr_NSweight_y_upper,
289 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
290 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
293 vf_curr_NSweight_y_lower,
296 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
297 scaled_v_plane[curr_src_idx],
303 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
304 scaled_u_plane[curr_src_idx],
311 // Store the result back to main memory into a destination buffer in YUV format
312 //---------------------------------------------------------------------------------------------
313 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
315 // Perform three DMA transfers to 3 different locations in the main memory!
316 // dst_width: Pixel width of destination image
317 // dst_addr: Destination address in main memory
318 // dst_vu: Counter which is incremented one by one
319 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
320 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
321 (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
322 dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
323 STR_BUF+curr_dst_idx, // Tag
326 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
327 (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
328 dst_dbl_linestride_vu, // One V line (dst_width/2 bytes)
329 STR_BUF+curr_dst_idx, // Tag
332 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
333 (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
334 dst_dbl_linestride_vu, // One U line (dst_width/2 bytes)
335 STR_BUF+curr_dst_idx, // Tag
337 //---------------------------------------------------------------------------------------------
340 // update for next cycle
341 curr_src_idx = next_src_idx;
342 curr_dst_idx = next_dst_idx;
344 curr_interpl_y_upper = next_interpl_y_upper;
345 curr_interpl_y_lower = next_interpl_y_lower;
346 curr_interpl_vu = next_interpl_vu;
// Carry the freshly computed Y weights into the next iteration (these two
// lines used to assign the current weights to themselves).
348 vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
349 vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
350 vf_curr_NSweight_vu = vf_next_NSweight_vu;
352 curr_src_y_upper = next_src_y_upper;
353 curr_src_y_lower = next_src_y_lower;
354 curr_src_vu = next_src_vu;
356 curr_lsoff_v = next_lsoff_v;
357 curr_lsoff_u = next_lsoff_u;
// Last output line pair: its source lines were requested by the final loop
// iteration, so only the wait, the scaling and the stores remain.
362 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
366 bilinear_scale_line_w16( y_plane[curr_src_idx],
367 scaled_y_plane[curr_src_idx],
370 vf_curr_NSweight_y_upper,
373 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
374 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
377 vf_curr_NSweight_y_lower,
380 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
381 scaled_v_plane[curr_src_idx],
387 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
388 scaled_u_plane[curr_src_idx],
395 // Store the result back to main memory into a destination buffer in YUV format
396 //---------------------------------------------------------------------------------------------
397 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
399 // Perform three DMA transfers to 3 different locations in the main memory!
400 // dst_width: Pixel width of destination image
401 // dst_addr: Destination address in main memory
402 // dst_vu: Counter which is incremented one by one
403 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
404 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
405 (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
406 dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
407 STR_BUF+curr_dst_idx, // Tag
410 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
411 (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
412 dst_dbl_linestride_vu, // One V line (dst_width/2 bytes)
413 STR_BUF+curr_dst_idx, // Tag
416 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
417 (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
418 dst_dbl_linestride_vu, // One U line (dst_width/2 bytes)
419 STR_BUF+curr_dst_idx, // Tag
422 // wait for completion
423 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
424 //---------------------------------------------------------------------------------------------
429 * scale_srcw16_dstw32()
431 * processes an input image of width 16
432 * scaling is done to a width 32
433 * yuv2rgb conversion on a width of 32
434 * result stored in RAM
/*
 * Bilinear-scales the planar YUV image described by the global `parms` and
 * DMAs the scaled Y, V and U planes to parms.dstBuffer (Y first, then V at
 * offset dst_width*dst_height, then U a further quarter picture later).
 * main() selects this routine when the source width is not a multiple of 32
 * and the destination width is.
 *
 * Source lines are fetched into y_plane/v_plane/u_plane[idx], where idx
 * toggles between 0 and 1 every iteration: the fetch for the next output
 * line pair is issued before the current pair is scaled. Each iteration
 * emits two Y lines plus one V and one U line; the last pair is processed
 * after the loop.
 *
 * V and U source addresses are rounded down to a 16-byte boundary for the
 * fetch; the discarded low bits (lsoff) are added back as a local-store
 * offset when the fetched line is scaled.
 *
 * NOTE(review): this listing elides some source lines (braces, trailing call
 * arguments); only the visible lines are reproduced here.
 */
436 void scale_srcw16_dstw32() {
437 // extract parameters
438 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
440 unsigned int src_width = parms.src_pixel_width;
441 unsigned int src_height = parms.src_pixel_height;
442 unsigned int dst_width = parms.dst_pixel_width;
443 unsigned int dst_height = parms.dst_pixel_height;
446 unsigned int src_linestride_y = src_width;
447 unsigned int src_dbl_linestride_y = src_width<<1;
448 unsigned int src_linestride_vu = src_width>>1;
449 unsigned int src_dbl_linestride_vu = src_width;
451 unsigned int scaled_src_linestride_y = dst_width;
454 unsigned char* src_addr_y = parms.y_plane;
455 unsigned char* src_addr_v = parms.v_plane;
456 unsigned char* src_addr_u = parms.u_plane;
458 unsigned int dst_picture_size = dst_width*dst_height;
460 // Sizes for destination
461 unsigned int dst_dbl_linestride_y = dst_width<<1;
462 unsigned int dst_dbl_linestride_vu = dst_width>>1;
464 // Perform address calculation for Y, V and U in main memory with dst_addr as base
465 unsigned char* dst_addr_main_memory_y = dst_addr;
466 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
467 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
470 // for handling misalignment, addresses are precalculated
471 unsigned char* precalc_src_addr_v = src_addr_v;
472 unsigned char* precalc_src_addr_u = src_addr_u;
474 // calculate scale factors
475 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
476 float y_scale = (float)src_height/(float)dst_height;
478 // double buffered processing
480 unsigned int curr_src_idx = 0;
481 unsigned int curr_dst_idx = 0;
482 unsigned int next_src_idx, next_dst_idx;
484 // 2 lines y as output, upper and lowerline
485 unsigned int curr_interpl_y_upper = 0;
486 unsigned int next_interpl_y_upper;
487 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
488 // only 1 line v/u output, both planes have the same dimension
489 unsigned int curr_interpl_vu = 0;
490 unsigned int next_interpl_vu;
492 // weights, calculated in every loop iteration
493 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
494 vector float vf_next_NSweight_y_upper;
495 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
496 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
497 vector float vf_next_NSweight_vu;
499 // line indices for the src picture
500 float curr_src_y_upper = 0.0f, next_src_y_upper;
501 float curr_src_y_lower, next_src_y_lower;
502 float curr_src_vu = 0.0f, next_src_vu;
504 // line indices for the dst picture
505 unsigned int dst_y=0, dst_vu=0;
507 // offset for the v and u plane to handle misalignment
508 unsigned int curr_lsoff_v = 0, next_lsoff_v;
509 unsigned int curr_lsoff_u = 0, next_lsoff_u;
511 // calculate lower line indices
512 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
513 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
515 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
518 // start partially double buffered processing
519 // get initial data, 2 sets of y, 1 set v, 1 set u
520 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
521 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
522 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
523 src_dbl_linestride_y,
526 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
527 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
530 // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
531 // the scaled output is 2 lines y, 1 line v, 1 line u
532 // the scaled YUV output is stored to RAM
533 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
536 // calculate next indices
537 next_src_vu = ((float)dst_vu+1)*y_scale;
538 next_src_y_upper = ((float)dst_y+2)*y_scale;
539 next_src_y_lower = ((float)dst_y+3)*y_scale;
541 next_interpl_vu = (unsigned int) next_src_vu;
542 next_interpl_y_upper = (unsigned int) next_src_y_upper;
543 next_interpl_y_lower = (unsigned int) next_src_y_lower;
545 // calculate weight NORTH-SOUTH
546 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
547 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
548 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
551 next_src_idx = curr_src_idx^1;
552 next_dst_idx = curr_dst_idx^1;
555 mfc_get( y_plane[next_src_idx],
556 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
557 src_dbl_linestride_y,
558 RETR_BUF+next_src_idx,
560 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
561 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
562 src_dbl_linestride_y,
563 RETR_BUF+next_src_idx,
// V fetch: align the address down to 16 bytes and enlarge the transfer by
// twice the dropped offset so both fetched lines still cover the data.
567 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
568 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
569 mfc_get( v_plane[next_src_idx],
570 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
571 src_dbl_linestride_vu+(next_lsoff_v<<1),
572 RETR_BUF+next_src_idx,
// U fetch: same scheme, but padded with the U plane's own offset (the
// previous code reused next_lsoff_v here).
575 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
576 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
577 mfc_get( u_plane[next_src_idx],
578 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
579 src_dbl_linestride_vu+(next_lsoff_u<<1),
580 RETR_BUF+next_src_idx,
583 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
587 bilinear_scale_line_w16( y_plane[curr_src_idx],
588 scaled_y_plane[curr_src_idx],
591 vf_curr_NSweight_y_upper,
594 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
595 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
598 vf_curr_NSweight_y_lower,
601 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
602 scaled_v_plane[curr_src_idx],
608 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
609 scaled_u_plane[curr_src_idx],
615 //---------------------------------------------------------------------------------------------
616 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
618 // Perform three DMA transfers to 3 different locations in the main memory!
619 // dst_width: Pixel width of destination image
620 // dst_addr: Destination address in main memory
621 // dst_vu: Counter which is incremented one by one
622 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
624 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
625 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
626 dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
627 STR_BUF+curr_dst_idx, // Tag
630 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
631 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
632 dst_dbl_linestride_vu, // One V line (dst_width/2 bytes)
633 STR_BUF+curr_dst_idx, // Tag
636 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
637 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
638 dst_dbl_linestride_vu, // One U line (dst_width/2 bytes)
639 STR_BUF+curr_dst_idx, // Tag
641 //---------------------------------------------------------------------------------------------
644 // update for next cycle
645 curr_src_idx = next_src_idx;
646 curr_dst_idx = next_dst_idx;
648 curr_interpl_y_upper = next_interpl_y_upper;
649 curr_interpl_y_lower = next_interpl_y_lower;
650 curr_interpl_vu = next_interpl_vu;
// Carry the freshly computed Y weights into the next iteration (these two
// lines used to assign the current weights to themselves).
652 vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
653 vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
654 vf_curr_NSweight_vu = vf_next_NSweight_vu;
656 curr_src_y_upper = next_src_y_upper;
657 curr_src_y_lower = next_src_y_lower;
658 curr_src_vu = next_src_vu;
660 curr_lsoff_v = next_lsoff_v;
661 curr_lsoff_u = next_lsoff_u;
// Last output line pair: its source lines were requested by the final loop
// iteration, so only the wait, the scaling and the stores remain.
666 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
670 bilinear_scale_line_w16( y_plane[curr_src_idx],
671 scaled_y_plane[curr_src_idx],
674 vf_curr_NSweight_y_upper,
677 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
678 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
681 vf_curr_NSweight_y_lower,
684 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
685 scaled_v_plane[curr_src_idx],
691 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
692 scaled_u_plane[curr_src_idx],
698 //---------------------------------------------------------------------------------------------
699 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
701 // Perform three DMA transfers to 3 different locations in the main memory!
702 // dst_width: Pixel width of destination image
703 // dst_addr: Destination address in main memory
704 // dst_vu: Counter which is incremented one by one
705 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
707 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
708 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
709 dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
710 STR_BUF+curr_dst_idx, // Tag
713 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
714 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
715 dst_dbl_linestride_vu, // One V line (dst_width/2 bytes)
716 STR_BUF+curr_dst_idx, // Tag
719 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
720 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
721 dst_dbl_linestride_vu, // One U line (dst_width/2 bytes)
722 STR_BUF+curr_dst_idx, // Tag
725 // wait for completion
726 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
727 //---------------------------------------------------------------------------------------------
732 * scale_srcw32_dstw16()
734 * processes an input image of width 32
735 * scaling is done to a width 16
736 * yuv2rgb conversion on a width of 16
737 * result stored in RAM
739 void scale_srcw32_dstw16() {
740 // extract parameters
741 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
743 unsigned int src_width = parms.src_pixel_width;
744 unsigned int src_height = parms.src_pixel_height;
745 unsigned int dst_width = parms.dst_pixel_width;
746 unsigned int dst_height = parms.dst_pixel_height;
749 unsigned int src_linestride_y = src_width;
750 unsigned int src_dbl_linestride_y = src_width<<1;
751 unsigned int src_linestride_vu = src_width>>1;
752 unsigned int src_dbl_linestride_vu = src_width;
754 unsigned int scaled_src_linestride_y = dst_width;
757 unsigned char* src_addr_y = parms.y_plane;
758 unsigned char* src_addr_v = parms.v_plane;
759 unsigned char* src_addr_u = parms.u_plane;
761 unsigned int dst_picture_size = dst_width*dst_height;
763 // Sizes for destination
764 unsigned int dst_dbl_linestride_y = dst_width<<1;
765 unsigned int dst_dbl_linestride_vu = dst_width>>1;
767 // Perform address calculation for Y, V and U in main memory with dst_addr as base
768 unsigned char* dst_addr_main_memory_y = dst_addr;
769 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
770 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
772 // calculate scale factors
773 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
774 float y_scale = (float)src_height/(float)dst_height;
776 // double buffered processing
778 unsigned int curr_src_idx = 0;
779 unsigned int curr_dst_idx = 0;
780 unsigned int next_src_idx, next_dst_idx;
782 // 2 lines y as output, upper and lowerline
783 unsigned int curr_interpl_y_upper = 0;
784 unsigned int next_interpl_y_upper;
785 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
786 // only 1 line v/u output, both planes have the same dimension
787 unsigned int curr_interpl_vu = 0;
788 unsigned int next_interpl_vu;
790 // weights, calculated in every loop iteration
791 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
792 vector float vf_next_NSweight_y_upper;
793 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
794 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
795 vector float vf_next_NSweight_vu;
797 // line indices for the src picture
798 float curr_src_y_upper = 0.0f, next_src_y_upper;
799 float curr_src_y_lower, next_src_y_lower;
800 float curr_src_vu = 0.0f, next_src_vu;
802 // line indices for the dst picture
803 unsigned int dst_y=0, dst_vu=0;
805 // calculate lower line idices
806 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
807 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
809 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
812 // start partially double buffered processing
813 // get initial data, 2 sets of y, 1 set v, 1 set u
814 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
815 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
816 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
817 src_dbl_linestride_y,
820 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
821 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
824 // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
825 // the scaled output is 2 lines y, 1 line v, 1 line u
826 // the yuv2rgb-converted output is stored to RAM
827 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
830 // calculate next indices
831 next_src_vu = ((float)dst_vu+1)*y_scale;
832 next_src_y_upper = ((float)dst_y+2)*y_scale;
833 next_src_y_lower = ((float)dst_y+3)*y_scale;
835 next_interpl_vu = (unsigned int) next_src_vu;
836 next_interpl_y_upper = (unsigned int) next_src_y_upper;
837 next_interpl_y_lower = (unsigned int) next_src_y_lower;
839 // calculate weight NORTH-SOUTH
840 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
841 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
842 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
845 next_src_idx = curr_src_idx^1;
846 next_dst_idx = curr_dst_idx^1;
849 mfc_get( y_plane[next_src_idx],
850 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
851 src_dbl_linestride_y,
852 RETR_BUF+next_src_idx,
854 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
855 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
856 src_dbl_linestride_y,
857 RETR_BUF+next_src_idx,
861 mfc_get( v_plane[next_src_idx],
862 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
863 src_dbl_linestride_vu,
864 RETR_BUF+next_src_idx,
867 mfc_get( u_plane[next_src_idx],
868 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
869 src_dbl_linestride_vu,
870 RETR_BUF+next_src_idx,
873 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
877 bilinear_scale_line_w16( y_plane[curr_src_idx],
878 scaled_y_plane[curr_src_idx],
881 vf_curr_NSweight_y_upper,
884 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
885 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
888 vf_curr_NSweight_y_lower,
891 bilinear_scale_line_w16( v_plane[curr_src_idx],
892 scaled_v_plane[curr_src_idx],
898 bilinear_scale_line_w16( u_plane[curr_src_idx],
899 scaled_u_plane[curr_src_idx],
905 //---------------------------------------------------------------------------------------------
906 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
908 // Perform three DMA transfers to 3 different locations in the main memory!
909 // dst_width: Pixel width of destination image
910 // dst_addr: Destination address in main memory
911 // dst_vu: Counter which is incremented one by one
912 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
914 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
915 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
916 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
917 STR_BUF+curr_dst_idx, // Tag
920 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
921 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
922 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
923 STR_BUF+curr_dst_idx, // Tag
926 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
927 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
928 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
929 STR_BUF+curr_dst_idx, // Tag
931 //---------------------------------------------------------------------------------------------
934 // update for next cycle
935 curr_src_idx = next_src_idx;
936 curr_dst_idx = next_dst_idx;
938 curr_interpl_y_upper = next_interpl_y_upper;
939 curr_interpl_y_lower = next_interpl_y_lower;
940 curr_interpl_vu = next_interpl_vu;
942 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
943 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
944 vf_curr_NSweight_vu = vf_next_NSweight_vu;
946 curr_src_y_upper = next_src_y_upper;
947 curr_src_y_lower = next_src_y_lower;
948 curr_src_vu = next_src_vu;
953 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
957 bilinear_scale_line_w16( y_plane[curr_src_idx],
958 scaled_y_plane[curr_src_idx],
961 vf_curr_NSweight_y_upper,
964 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
965 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
968 vf_curr_NSweight_y_lower,
971 bilinear_scale_line_w16( v_plane[curr_src_idx],
972 scaled_v_plane[curr_src_idx],
978 bilinear_scale_line_w16( u_plane[curr_src_idx],
979 scaled_u_plane[curr_src_idx],
986 //---------------------------------------------------------------------------------------------
987 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
989 // Perform three DMA transfers to 3 different locations in the main memory!
990 // dst_width: Pixel width of destination image
991 // dst_addr: Destination address in main memory
992 // dst_vu: Counter which is incremented one by one
993 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
995 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
996 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
997 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
998 STR_BUF+curr_dst_idx, // Tag
1001 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
1002 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1003 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
1004 STR_BUF+curr_dst_idx, // Tag
1007 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
1008 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1009 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
1010 STR_BUF+curr_dst_idx, // Tag
1013 // wait for completion
1014 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1015 //---------------------------------------------------------------------------------------------
1020 * scale_srcw32_dstw32()
1022 * processes an input image of width 32
1023 * scaling is done to a width 32
1024 * yuv2rgb conversion on a width of 32
1025 * result stored in RAM
1027 void scale_srcw32_dstw32() {
1028 // extract parameters
1029 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
1031 unsigned int src_width = parms.src_pixel_width;
1032 unsigned int src_height = parms.src_pixel_height;
1033 unsigned int dst_width = parms.dst_pixel_width;
1034 unsigned int dst_height = parms.dst_pixel_height;
1037 unsigned int src_linestride_y = src_width;
1038 unsigned int src_dbl_linestride_y = src_width<<1;
1039 unsigned int src_linestride_vu = src_width>>1;
1040 unsigned int src_dbl_linestride_vu = src_width;
1043 unsigned int scaled_src_linestride_y = dst_width;
1046 unsigned char* src_addr_y = parms.y_plane;
1047 unsigned char* src_addr_v = parms.v_plane;
1048 unsigned char* src_addr_u = parms.u_plane;
1050 unsigned int dst_picture_size = dst_width*dst_height;
1052 // Sizes for destination
1053 unsigned int dst_dbl_linestride_y = dst_width<<1;
1054 unsigned int dst_dbl_linestride_vu = dst_width>>1;
1056 // Perform address calculation for Y, V and U in main memory with dst_addr as base
1057 unsigned char* dst_addr_main_memory_y = dst_addr;
1058 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
1059 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
1061 // calculate scale factors
1062 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
1063 float y_scale = (float)src_height/(float)dst_height;
1065 // double buffered processing
1067 unsigned int curr_src_idx = 0;
1068 unsigned int curr_dst_idx = 0;
1069 unsigned int next_src_idx, next_dst_idx;
1071 // 2 lines y as output, upper and lowerline
1072 unsigned int curr_interpl_y_upper = 0;
1073 unsigned int next_interpl_y_upper;
1074 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
1075 // only 1 line v/u output, both planes have the same dimension
1076 unsigned int curr_interpl_vu = 0;
1077 unsigned int next_interpl_vu;
1079 // weights, calculated in every loop iteration
1080 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
1081 vector float vf_next_NSweight_y_upper;
1082 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
1083 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
1084 vector float vf_next_NSweight_vu;
1086 // line indices for the src picture
1087 float curr_src_y_upper = 0.0f, next_src_y_upper;
1088 float curr_src_y_lower, next_src_y_lower;
1089 float curr_src_vu = 0.0f, next_src_vu;
1091 // line indices for the dst picture
1092 unsigned int dst_y=0, dst_vu=0;
1094 // calculate lower line idices
1095 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
1096 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
1097 // lower line weight
1098 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
1101 // start partially double buffered processing
1102 // get initial data, 2 sets of y, 1 set v, 1 set u
1103 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
1104 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
1105 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
1106 src_dbl_linestride_y,
1109 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1110 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1113 // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
1114 // the scaled output is 2 lines y, 1 line v, 1 line u
1115 // the yuv2rgb-converted output is stored to RAM
1116 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
1119 // calculate next indices
1120 next_src_vu = ((float)dst_vu+1)*y_scale;
1121 next_src_y_upper = ((float)dst_y+2)*y_scale;
1122 next_src_y_lower = ((float)dst_y+3)*y_scale;
1124 next_interpl_vu = (unsigned int) next_src_vu;
1125 next_interpl_y_upper = (unsigned int) next_src_y_upper;
1126 next_interpl_y_lower = (unsigned int) next_src_y_lower;
1128 // calculate weight NORTH-SOUTH
1129 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
1130 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
1131 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
1134 next_src_idx = curr_src_idx^1;
1135 next_dst_idx = curr_dst_idx^1;
1138 mfc_get( y_plane[next_src_idx],
1139 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
1140 src_dbl_linestride_y,
1141 RETR_BUF+next_src_idx,
1143 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
1144 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
1145 src_dbl_linestride_y,
1146 RETR_BUF+next_src_idx,
1150 mfc_get( v_plane[next_src_idx],
1151 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
1152 src_dbl_linestride_vu,
1153 RETR_BUF+next_src_idx,
1156 mfc_get( u_plane[next_src_idx],
1157 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
1158 src_dbl_linestride_vu,
1159 RETR_BUF+next_src_idx,
1162 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1165 // work line y_upper
1166 bilinear_scale_line_w16( y_plane[curr_src_idx],
1167 scaled_y_plane[curr_src_idx],
1170 vf_curr_NSweight_y_upper,
1172 // work line y_lower
1173 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1174 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1177 vf_curr_NSweight_y_lower,
1180 bilinear_scale_line_w16( v_plane[curr_src_idx],
1181 scaled_v_plane[curr_src_idx],
1184 vf_curr_NSweight_vu,
1185 src_linestride_vu );
1187 bilinear_scale_line_w16( u_plane[curr_src_idx],
1188 scaled_u_plane[curr_src_idx],
1191 vf_curr_NSweight_vu,
1192 src_linestride_vu );
1196 // Store the result back to main memory into a destination buffer in YUV format
1197 //---------------------------------------------------------------------------------------------
1198 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1200 // Perform three DMA transfers to 3 different locations in the main memory!
1201 // dst_width: Pixel width of destination image
1202 // dst_addr: Destination address in main memory
1203 // dst_vu: Counter which is incremented one by one
1204 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1206 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
1207 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
1208 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
1209 STR_BUF+curr_dst_idx, // Tag
1212 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
1213 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1214 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
1215 STR_BUF+curr_dst_idx, // Tag
1218 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
1219 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1220 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
1221 STR_BUF+curr_dst_idx, // Tag
1223 //---------------------------------------------------------------------------------------------
1226 // update for next cycle
1227 curr_src_idx = next_src_idx;
1228 curr_dst_idx = next_dst_idx;
1230 curr_interpl_y_upper = next_interpl_y_upper;
1231 curr_interpl_y_lower = next_interpl_y_lower;
1232 curr_interpl_vu = next_interpl_vu;
1234 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
1235 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
1236 vf_curr_NSweight_vu = vf_next_NSweight_vu;
1238 curr_src_y_upper = next_src_y_upper;
1239 curr_src_y_lower = next_src_y_lower;
1240 curr_src_vu = next_src_vu;
1245 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1248 // work line y_upper
1249 bilinear_scale_line_w16( y_plane[curr_src_idx],
1250 scaled_y_plane[curr_src_idx],
1253 vf_curr_NSweight_y_upper,
1255 // work line y_lower
1256 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1257 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1260 vf_curr_NSweight_y_lower,
1263 bilinear_scale_line_w16( v_plane[curr_src_idx],
1264 scaled_v_plane[curr_src_idx],
1267 vf_curr_NSweight_vu,
1268 src_linestride_vu );
1270 bilinear_scale_line_w16( u_plane[curr_src_idx],
1271 scaled_u_plane[curr_src_idx],
1274 vf_curr_NSweight_vu,
1275 src_linestride_vu );
1278 // Store the result back to main memory into a destination buffer in YUV format
1279 //---------------------------------------------------------------------------------------------
1280 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1282 // Perform three DMA transfers to 3 different locations in the main memory!
1283 // dst_width: Pixel width of destination image
1284 // dst_addr: Destination address in main memory
1285 // dst_vu: Counter which is incremented one by one
1286 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1288 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
1289 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
1290 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
1291 STR_BUF+curr_dst_idx, // Tag
1294 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
1295 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1296 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
1297 STR_BUF+curr_dst_idx, // Tag
1300 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
1301 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1302 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
1303 STR_BUF+curr_dst_idx, // Tag
1306 // wait for completion
1307 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1308 //---------------------------------------------------------------------------------------------
1313 * bilinear_scale_line_w8()
1315 * processes a line of yuv-input, width has to be a multiple of 8
1316 * scaled yuv-output is written to local store buffer
1318 * @param src buffer for 2 lines input
1319 * @param dst_ buffer for 1 line output
1320 * @param dst_width the width of the destination line
1321 * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1322 * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1323 * @param src_linestride the stride of the srcline
1325 void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1327 unsigned char* dst = dst_;
1330 for( dst_x=0; dst_x<dst_width; dst_x+=8) {
1331 // address calculation for loading the 4 surrounding pixel of each calculated
1332 // destination pixel
1333 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1334 // lower range->first 4 pixel
1335 // upper range->next 4 pixel
1336 vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
1337 vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
1338 vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
1339 vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
1341 // calculate weight EAST-WEST
1342 vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
1343 vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
1344 vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
1345 vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
1346 vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
1347 vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
1348 vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
1349 vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
1350 vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
1351 vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
1353 // calculate address offset
1356 vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
1357 vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
1359 // pixel NORTH EAST-->(offpixelNW+1)
1360 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1361 vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
1362 vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
1364 // SOUTH-WEST-->(offpixelNW+src_linestride)
1365 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1366 vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
1367 vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
1369 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1370 vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
1371 vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
1373 // calculate each address
1374 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1375 vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
1376 vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
1377 vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
1378 vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
1380 vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
1381 vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
1382 vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
1383 vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
1387 // scalar load, afterwards insertion into the right position
1389 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1390 vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
1391 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
1392 vuc_pixel_NW_lower_range = spu_insert(
1393 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
1394 vuc_pixel_NW_lower_range, 7 );
1395 vuc_pixel_NW_lower_range = spu_insert(
1396 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
1397 vuc_pixel_NW_lower_range, 11 );
1398 vuc_pixel_NW_lower_range = spu_insert(
1399 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
1400 vuc_pixel_NW_lower_range, 15 );
1402 vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
1403 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
1404 vuc_pixel_NW_upper_range = spu_insert(
1405 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
1406 vuc_pixel_NW_upper_range, 7 );
1407 vuc_pixel_NW_upper_range = spu_insert(
1408 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
1409 vuc_pixel_NW_upper_range, 11 );
1410 vuc_pixel_NW_upper_range = spu_insert(
1411 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
1412 vuc_pixel_NW_upper_range, 15 );
1415 vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
1416 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
1417 vuc_pixel_NE_lower_range = spu_insert(
1418 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
1419 vuc_pixel_NE_lower_range, 7 );
1420 vuc_pixel_NE_lower_range = spu_insert(
1421 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
1422 vuc_pixel_NE_lower_range, 11 );
1423 vuc_pixel_NE_lower_range = spu_insert(
1424 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
1425 vuc_pixel_NE_lower_range, 15 );
1427 vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
1428 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
1429 vuc_pixel_NE_upper_range = spu_insert(
1430 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
1431 vuc_pixel_NE_upper_range, 7 );
1432 vuc_pixel_NE_upper_range = spu_insert(
1433 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
1434 vuc_pixel_NE_upper_range, 11 );
1435 vuc_pixel_NE_upper_range = spu_insert(
1436 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
1437 vuc_pixel_NE_upper_range, 15 );
1441 vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
1442 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
1443 vuc_pixel_SW_lower_range = spu_insert(
1444 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
1445 vuc_pixel_SW_lower_range, 7 );
1446 vuc_pixel_SW_lower_range = spu_insert(
1447 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
1448 vuc_pixel_SW_lower_range, 11 );
1449 vuc_pixel_SW_lower_range = spu_insert(
1450 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
1451 vuc_pixel_SW_lower_range, 15 );
1453 vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
1454 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
1455 vuc_pixel_SW_upper_range = spu_insert(
1456 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
1457 vuc_pixel_SW_upper_range, 7 );
1458 vuc_pixel_SW_upper_range = spu_insert(
1459 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
1460 vuc_pixel_SW_upper_range, 11 );
1461 vuc_pixel_SW_upper_range = spu_insert(
1462 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
1463 vuc_pixel_SW_upper_range, 15 );
1466 vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
1467 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
1468 vuc_pixel_SE_lower_range = spu_insert(
1469 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
1470 vuc_pixel_SE_lower_range, 7 );
1471 vuc_pixel_SE_lower_range = spu_insert(
1472 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
1473 vuc_pixel_SE_lower_range, 11 );
1474 vuc_pixel_SE_lower_range = spu_insert(
1475 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
1476 vuc_pixel_SE_lower_range, 15 );
1478 vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
1479 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
1480 vuc_pixel_SE_upper_range = spu_insert(
1481 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
1482 vuc_pixel_SE_upper_range, 7 );
1483 vuc_pixel_SE_upper_range = spu_insert(
1484 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
1485 vuc_pixel_SE_upper_range, 11 );
1486 vuc_pixel_SE_upper_range = spu_insert(
1487 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
1488 vuc_pixel_SE_upper_range, 15 );
1492 vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
1493 vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
1495 vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
1496 vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
1498 vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
1499 vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
1501 vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
1502 vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
1506 // first linear interpolation: EWtop
1507 // EWtop = NW + EWweight*(NE-NW)
1510 vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
1511 vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
1512 vf_EWtop_lower_range_tmp,
1513 vf_pixel_NW_lower_range );
1516 vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
1517 vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
1518 vf_EWtop_upper_range_tmp,
1519 vf_pixel_NW_upper_range );
1523 // second linear interpolation: EWbottom
1524 // EWbottom = SW + EWweight*(SE-SW)
1527 vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
1528 vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
1529 vf_EWbottom_lower_range_tmp,
1530 vf_pixel_SW_lower_range );
1533 vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
1534 vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
1535 vf_EWbottom_upper_range_tmp,
1536 vf_pixel_SW_upper_range );
1540 // third linear interpolation: the bilinear interpolated value
1541 // result = EWtop + NSweight*(EWbottom-EWtop);
1544 vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
1545 vector float vf_result_lower_range = spu_madd( vf_NSweight,
1546 vf_result_lower_range_tmp,
1547 vf_EWtop_lower_range );
1550 vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
1551 vector float vf_result_upper_range = spu_madd( vf_NSweight,
1552 vf_result_upper_range_tmp,
1553 vf_EWtop_upper_range );
1556 // convert back: using saturated arithmetic
1557 vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
1558 vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
1560 // merge results->lower,upper
1561 vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
1562 0x13, 0x17, 0x1B, 0x1F,
1563 0x00, 0x00, 0x00, 0x00,
1564 0x00, 0x00, 0x00, 0x00 };
1566 vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
1567 (vector unsigned char) vui_result_upper_range,
1568 vuc_mask_merge_result );
1571 vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
1572 0x00, 0x00, 0x00, 0x00,
1573 0xFF, 0xFF, 0xFF, 0xFF,
1574 0xFF, 0xFF, 0xFF, 0xFF };
1577 // get currently stored data
1578 vector unsigned char vuc_orig = *((vector unsigned char*)dst);
1580 // clear currently stored data
1581 vuc_orig = spu_and( vuc_orig,
1582 spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
1584 // rotate result according to storing address
1585 vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
1588 *((vector unsigned char*)dst) = spu_or( vuc_result,
1596 * bilinear_scale_line_w16()
1598 * processes a line of yuv-input, width has to be a multiple of 16
1599 * scaled yuv-output is written to local store buffer
1601 * @param src buffer for 2 lines input
1602 * @param dst_ buffer for 1 line output
1603 * @param dst_width the width of the destination line
1604 * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1605 * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1606 * @param src_linestride the stride of the srcline
1608 void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1610 unsigned char* dst = dst_;
1613 for( dst_x=0; dst_x<dst_width; dst_x+=16) {
1614 // address calculation for loading the 4 surrounding pixel of each calculated
1615 // destination pixel
1616 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1617 // parallelised processing
1618 // first range->pixel 1 2 3 4
1619 // second range->pixel 5 6 7 8
1620 // third range->pixel 9 10 11 12
1621 // fourth range->pixel 13 14 15 16
1622 vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
1623 vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
1624 vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
1625 vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
1626 vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
1627 vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
1628 vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
1629 vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
1631 // calculate weight EAST-WEST
1632 vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
1633 vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
1634 vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
1635 vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
1636 vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
1637 vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
1638 vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
1639 vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
1640 vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
1641 vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
1642 vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
1643 vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
1644 vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
1645 vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
1646 vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
1647 vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
1648 vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
1649 vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
1650 vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
1651 vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
1653 // calculate address offset
1656 vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
1657 vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
1658 vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
1659 vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
1661 // pixel NORTH EAST-->(offpixelNW+1)
1662 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1663 vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
1664 vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
1665 vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
1666 vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
1668 // SOUTH-WEST-->(offpixelNW+src_linestride)
1669 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1670 vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
1671 vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
1672 vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
1673 vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
1675 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1676 vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
1677 vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
1678 vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
1679 vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
1681 // calculate each address
1682 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1683 vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
1684 vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
1685 vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
1686 vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
1688 vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
1689 vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
1690 vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
1691 vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
1693 vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
1694 vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
1695 vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
1696 vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
1698 vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
1699 vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
1700 vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
1701 vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
1706 // scalar load, afterwards insertion into the right position
1709 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1710 vector unsigned char vuc_pixel_NW_first_range = spu_insert(
1711 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
1712 vuc_pixel_NW_first_range = spu_insert(
1713 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
1714 vuc_pixel_NW_first_range, 7 );
1715 vuc_pixel_NW_first_range = spu_insert(
1716 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
1717 vuc_pixel_NW_first_range, 11 );
1718 vuc_pixel_NW_first_range = spu_insert(
1719 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
1720 vuc_pixel_NW_first_range, 15 );
1722 vector unsigned char vuc_pixel_NW_second_range = spu_insert(
1723 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
1724 vuc_pixel_NW_second_range = spu_insert(
1725 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
1726 vuc_pixel_NW_second_range, 7 );
1727 vuc_pixel_NW_second_range = spu_insert(
1728 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
1729 vuc_pixel_NW_second_range, 11 );
1730 vuc_pixel_NW_second_range = spu_insert(
1731 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
1732 vuc_pixel_NW_second_range, 15 );
1734 vector unsigned char vuc_pixel_NW_third_range = spu_insert(
1735 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
1736 vuc_pixel_NW_third_range = spu_insert(
1737 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
1738 vuc_pixel_NW_third_range, 7 );
1739 vuc_pixel_NW_third_range = spu_insert(
1740 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
1741 vuc_pixel_NW_third_range, 11 );
1742 vuc_pixel_NW_third_range = spu_insert(
1743 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
1744 vuc_pixel_NW_third_range, 15 );
1746 vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
1747 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
1748 vuc_pixel_NW_fourth_range = spu_insert(
1749 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
1750 vuc_pixel_NW_fourth_range, 7 );
1751 vuc_pixel_NW_fourth_range = spu_insert(
1752 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
1753 vuc_pixel_NW_fourth_range, 11 );
1754 vuc_pixel_NW_fourth_range = spu_insert(
1755 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
1756 vuc_pixel_NW_fourth_range, 15 );
1760 vector unsigned char vuc_pixel_NE_first_range = spu_insert(
1761 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
1762 vuc_pixel_NE_first_range = spu_insert(
1763 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
1764 vuc_pixel_NE_first_range, 7 );
1765 vuc_pixel_NE_first_range = spu_insert(
1766 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
1767 vuc_pixel_NE_first_range, 11 );
1768 vuc_pixel_NE_first_range = spu_insert(
1769 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
1770 vuc_pixel_NE_first_range, 15 );
1772 vector unsigned char vuc_pixel_NE_second_range = spu_insert(
1773 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
1774 vuc_pixel_NE_second_range = spu_insert(
1775 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
1776 vuc_pixel_NE_second_range, 7 );
1777 vuc_pixel_NE_second_range = spu_insert(
1778 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
1779 vuc_pixel_NE_second_range, 11 );
1780 vuc_pixel_NE_second_range = spu_insert(
1781 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
1782 vuc_pixel_NE_second_range, 15 );
1784 vector unsigned char vuc_pixel_NE_third_range = spu_insert(
1785 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
1786 vuc_pixel_NE_third_range = spu_insert(
1787 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
1788 vuc_pixel_NE_third_range, 7 );
1789 vuc_pixel_NE_third_range = spu_insert(
1790 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
1791 vuc_pixel_NE_third_range, 11 );
1792 vuc_pixel_NE_third_range = spu_insert(
1793 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
1794 vuc_pixel_NE_third_range, 15 );
1796 vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
1797 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
1798 vuc_pixel_NE_fourth_range = spu_insert(
1799 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
1800 vuc_pixel_NE_fourth_range, 7 );
1801 vuc_pixel_NE_fourth_range = spu_insert(
1802 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
1803 vuc_pixel_NE_fourth_range, 11 );
1804 vuc_pixel_NE_fourth_range = spu_insert(
1805 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
1806 vuc_pixel_NE_fourth_range, 15 );
1810 vector unsigned char vuc_pixel_SW_first_range = spu_insert(
1811 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
1812 vuc_pixel_SW_first_range = spu_insert(
1813 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
1814 vuc_pixel_SW_first_range, 7 );
1815 vuc_pixel_SW_first_range = spu_insert(
1816 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
1817 vuc_pixel_SW_first_range, 11 );
1818 vuc_pixel_SW_first_range = spu_insert(
1819 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
1820 vuc_pixel_SW_first_range, 15 );
1822 vector unsigned char vuc_pixel_SW_second_range = spu_insert(
1823 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
1824 vuc_pixel_SW_second_range = spu_insert(
1825 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
1826 vuc_pixel_SW_second_range, 7 );
1827 vuc_pixel_SW_second_range = spu_insert(
1828 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
1829 vuc_pixel_SW_second_range, 11 );
1830 vuc_pixel_SW_second_range = spu_insert(
1831 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
1832 vuc_pixel_SW_second_range, 15 );
1834 vector unsigned char vuc_pixel_SW_third_range = spu_insert(
1835 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
1836 vuc_pixel_SW_third_range = spu_insert(
1837 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
1838 vuc_pixel_SW_third_range, 7 );
1839 vuc_pixel_SW_third_range = spu_insert(
1840 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
1841 vuc_pixel_SW_third_range, 11 );
1842 vuc_pixel_SW_third_range = spu_insert(
1843 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
1844 vuc_pixel_SW_third_range, 15 );
1846 vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
1847 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
1848 vuc_pixel_SW_fourth_range = spu_insert(
1849 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
1850 vuc_pixel_SW_fourth_range, 7 );
1851 vuc_pixel_SW_fourth_range = spu_insert(
1852 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
1853 vuc_pixel_SW_fourth_range, 11 );
1854 vuc_pixel_SW_fourth_range = spu_insert(
1855 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
1856 vuc_pixel_SW_fourth_range, 15 );
1860 vector unsigned char vuc_pixel_SE_first_range = spu_insert(
1861 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
1862 vuc_pixel_SE_first_range = spu_insert(
1863 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
1864 vuc_pixel_SE_first_range, 7 );
1865 vuc_pixel_SE_first_range = spu_insert(
1866 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
1867 vuc_pixel_SE_first_range, 11 );
1868 vuc_pixel_SE_first_range = spu_insert(
1869 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
1870 vuc_pixel_SE_first_range, 15 );
1872 vector unsigned char vuc_pixel_SE_second_range = spu_insert(
1873 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
1874 vuc_pixel_SE_second_range = spu_insert(
1875 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
1876 vuc_pixel_SE_second_range, 7 );
1877 vuc_pixel_SE_second_range = spu_insert(
1878 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
1879 vuc_pixel_SE_second_range, 11 );
1880 vuc_pixel_SE_second_range = spu_insert(
1881 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
1882 vuc_pixel_SE_second_range, 15 );
1884 vector unsigned char vuc_pixel_SE_third_range = spu_insert(
1885 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
1886 vuc_pixel_SE_third_range = spu_insert(
1887 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
1888 vuc_pixel_SE_third_range, 7 );
1889 vuc_pixel_SE_third_range = spu_insert(
1890 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
1891 vuc_pixel_SE_third_range, 11 );
1892 vuc_pixel_SE_third_range = spu_insert(
1893 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
1894 vuc_pixel_SE_third_range, 15 );
1896 vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
1897 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
1898 vuc_pixel_SE_fourth_range = spu_insert(
1899 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
1900 vuc_pixel_SE_fourth_range, 7 );
1901 vuc_pixel_SE_fourth_range = spu_insert(
1902 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
1903 vuc_pixel_SE_fourth_range, 11 );
1904 vuc_pixel_SE_fourth_range = spu_insert(
1905 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
1906 vuc_pixel_SE_fourth_range, 15 );
1911 vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
1912 vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
1913 vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
1914 vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
1916 vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
1917 vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
1918 vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
1919 vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
1921 vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
1922 vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
1923 vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
1924 vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
1926 vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
1927 vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
1928 vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
1929 vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
1931 // first linear interpolation: EWtop
1932 // EWtop = NW + EWweight*(NE-NW)
1935 vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
1936 vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
1937 vf_EWtop_first_range_tmp,
1938 vf_pixel_NW_first_range );
1941 vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
1942 vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
1943 vf_EWtop_second_range_tmp,
1944 vf_pixel_NW_second_range );
1947 vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
1948 vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
1949 vf_EWtop_third_range_tmp,
1950 vf_pixel_NW_third_range );
1953 vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
1954 vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
1955 vf_EWtop_fourth_range_tmp,
1956 vf_pixel_NW_fourth_range );
1960 // second linear interpolation: EWbottom
1961 // EWbottom = SW + EWweight*(SE-SW)
1964 vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
1965 vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
1966 vf_EWbottom_first_range_tmp,
1967 vf_pixel_SW_first_range );
1970 vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
1971 vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
1972 vf_EWbottom_second_range_tmp,
1973 vf_pixel_SW_second_range );
1975 vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
1976 vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
1977 vf_EWbottom_third_range_tmp,
1978 vf_pixel_SW_third_range );
1981 vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
1982 vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
1983 vf_EWbottom_fourth_range_tmp,
1984 vf_pixel_SW_fourth_range );
1988 // third linear interpolation: the bilinear interpolated value
1989 // result = EWtop + NSweight*(EWbottom-EWtop);
1992 vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
1993 vector float vf_result_first_range = spu_madd( vf_NSweight,
1994 vf_result_first_range_tmp,
1995 vf_EWtop_first_range );
1998 vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
1999 vector float vf_result_second_range = spu_madd( vf_NSweight,
2000 vf_result_second_range_tmp,
2001 vf_EWtop_second_range );
2004 vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
2005 vector float vf_result_third_range = spu_madd( vf_NSweight,
2006 vf_result_third_range_tmp,
2007 vf_EWtop_third_range );
2010 vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
2011 vector float vf_result_fourth_range = spu_madd( vf_NSweight,
2012 vf_result_fourth_range_tmp,
2013 vf_EWtop_fourth_range );
2017 // convert back: using saturated arithmetic
2018 vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
2019 vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
2020 vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
2021 vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
2023 // merge results->lower,upper
2024 vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
2025 0x13, 0x17, 0x1B, 0x1F,
2026 0x00, 0x00, 0x00, 0x00,
2027 0x00, 0x00, 0x00, 0x00 };
2029 vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
2030 0x00, 0x00, 0x00, 0x00,
2031 0x03, 0x07, 0x0B, 0x0F,
2032 0x13, 0x17, 0x1B, 0x1F };
2034 vector unsigned char vuc_result_first_second =
2035 spu_shuffle( (vector unsigned char) vui_result_first_range,
2036 (vector unsigned char) vui_result_second_range,
2037 vuc_mask_merge_result_first_second );
2039 vector unsigned char vuc_result_third_fourth =
2040 spu_shuffle( (vector unsigned char) vui_result_third_range,
2041 (vector unsigned char) vui_result_fourth_range,
2042 vuc_mask_merge_result_third_fourth );
2045 *((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
2046 vuc_result_third_fourth );