SDL-1.2.14
[sdl_omap.git] / src / video / ps3 / spulibs / bilin_scaler.c
CommitLineData
e14743d1 1/*
2 * SDL - Simple DirectMedia Layer
3 * CELL BE Support for PS3 Framebuffer
4 * Copyright (C) 2008, 2009 International Business Machines Corporation
5 *
6 * This library is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU Lesser General Public License as published
8 * by the Free Software Foundation; either version 2.1 of the License, or
9 * (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
20 *
21 * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com>
22 * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
23 * SPE code based on research by:
24 * Rene Becker
25 * Thimo Emmerich
26 */
27
28#include "spu_common.h"
29
30#include <spu_intrinsics.h>
31#include <spu_mfcio.h>
32
// Debugging
//#define DEBUG

/*
 * deprintf()
 *
 * printf-style trace output to stdout, flushed after every message.
 * Compiled out completely unless DEBUG is defined.
 *
 * The DEBUG variant is wrapped in do { } while (0) so that the macro
 * expands to exactly one statement and can be used safely as the body
 * of an unbraced if/else.
 */
#ifdef DEBUG
#define deprintf(fmt, args... ) \
    do { \
        fprintf( stdout, fmt, ##args ); \
        fflush( stdout ); \
    } while (0)
#else
#define deprintf( fmt, args... )
#endif
43
44struct scale_parms_t parms __attribute__((aligned(128)));
45
46/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
47 * there might be the need to retrieve misaligned data, adjust
48 * incoming v and u plane to be able to handle this (add 128)
49 */
50unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
51unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
52unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
53
54/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */
55unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
56unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
57unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
58
59/* some vectors needed by the float to int conversion */
60static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
61static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
62
63void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
64void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
65
66void scale_srcw16_dstw16();
67void scale_srcw16_dstw32();
68void scale_srcw32_dstw16();
69void scale_srcw32_dstw32();
70
71int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
72{
73 deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
74 /* DMA transfer for the input parameters */
75 spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
76 DMA_WAIT_TAG(TAG_INIT);
77
78 deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
79 parms.dst_pixel_width, parms.dst_pixel_height);
80
81 if(parms.src_pixel_width & 0x1f) {
82 if(parms.dst_pixel_width & 0x1F) {
83 deprintf("[SPU] Using scale_srcw16_dstw16\n");
84 scale_srcw16_dstw16();
85 } else {
86 deprintf("[SPU] Using scale_srcw16_dstw32\n");
87 scale_srcw16_dstw32();
88 }
89 } else {
90 if(parms.dst_pixel_width & 0x1F) {
91 deprintf("[SPU] Using scale_srcw32_dstw16\n");
92 scale_srcw32_dstw16();
93 } else {
94 deprintf("[SPU] Using scale_srcw32_dstw32\n");
95 scale_srcw32_dstw32();
96 }
97 }
98 deprintf("[SPU] bilin_scaler_spu... done!\n");
99
100 return 0;
101}
102
103
104/*
105 * vfloat_to_vuint()
106 *
107 * converts a float vector to an unsinged int vector using saturated
108 * arithmetic
109 *
110 * @param vec_s float vector for conversion
111 * @returns converted unsigned int vector
112 */
113inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
114 vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
115 vec_s = spu_sel(vec_s, vec_0_1, select_1);
116
117 vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
118 vec_s = spu_sel(vec_s, vec_255, select_2);
119 return spu_convtu(vec_s,0);
120}
121
122
123/*
124 * scale_srcw16_dstw16()
125 *
126 * processes an input image of width 16
127 * scaling is done to a width 16
128 * result stored in RAM
129 */
130void scale_srcw16_dstw16() {
131 // extract parameters
132 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
133
134 unsigned int src_width = parms.src_pixel_width;
135 unsigned int src_height = parms.src_pixel_height;
136 unsigned int dst_width = parms.dst_pixel_width;
137 unsigned int dst_height = parms.dst_pixel_height;
138
139 // YVU
140 unsigned int src_linestride_y = src_width;
141 unsigned int src_dbl_linestride_y = src_width<<1;
142 unsigned int src_linestride_vu = src_width>>1;
143 unsigned int src_dbl_linestride_vu = src_width;
144
145 // scaled YVU
146 unsigned int scaled_src_linestride_y = dst_width;
147
148 // ram addresses
149 unsigned char* src_addr_y = parms.y_plane;
150 unsigned char* src_addr_v = parms.v_plane;
151 unsigned char* src_addr_u = parms.u_plane;
152
153 // for handling misalignment, addresses are precalculated
154 unsigned char* precalc_src_addr_v = src_addr_v;
155 unsigned char* precalc_src_addr_u = src_addr_u;
156
157 unsigned int dst_picture_size = dst_width*dst_height;
158
159 // Sizes for destination
160 unsigned int dst_dbl_linestride_y = dst_width<<1;
161 unsigned int dst_dbl_linestride_vu = dst_width>>1;
162
163 // Perform address calculation for Y, V and U in main memory with dst_addr as base
164 unsigned char* dst_addr_main_memory_y = dst_addr;
165 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
166 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
167
168 // calculate scale factors
169 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
170 float y_scale = (float)src_height/(float)dst_height;
171
172 // double buffered processing
173 // buffer switching
174 unsigned int curr_src_idx = 0;
175 unsigned int curr_dst_idx = 0;
176 unsigned int next_src_idx, next_dst_idx;
177
178 // 2 lines y as output, upper and lowerline
179 unsigned int curr_interpl_y_upper = 0;
180 unsigned int next_interpl_y_upper;
181 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
182 // only 1 line v/u output, both planes have the same dimension
183 unsigned int curr_interpl_vu = 0;
184 unsigned int next_interpl_vu;
185
186 // weights, calculated in every loop iteration
187 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
188 vector float vf_next_NSweight_y_upper;
189 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
190 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
191 vector float vf_next_NSweight_vu;
192
193 // line indices for the src picture
194 float curr_src_y_upper = 0.0f, next_src_y_upper;
195 float curr_src_y_lower, next_src_y_lower;
196 float curr_src_vu = 0.0f, next_src_vu;
197
198 // line indices for the dst picture
199 unsigned int dst_y=0, dst_vu=0;
200
201 // offset for the v and u plane to handle misalignement
202 unsigned int curr_lsoff_v = 0, next_lsoff_v;
203 unsigned int curr_lsoff_u = 0, next_lsoff_u;
204
205 // calculate lower line indices
206 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
207 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
208 // lower line weight
209 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
210
211
212 // start partially double buffered processing
213 // get initial data, 2 sets of y, 1 set v, 1 set u
214 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
215 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
216 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
217 src_dbl_linestride_y,
218 RETR_BUF,
219 0, 0 );
220 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
221 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
222
223 /* iteration loop
224 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
225 * the scaled output is 2 lines y, 1 line v, 1 line u
226 * the yuv2rgb-converted output is stored to RAM
227 */
228 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
229 dst_y = dst_vu<<1;
230
231 // calculate next indices
232 next_src_vu = ((float)dst_vu+1)*y_scale;
233 next_src_y_upper = ((float)dst_y+2)*y_scale;
234 next_src_y_lower = ((float)dst_y+3)*y_scale;
235
236 next_interpl_vu = (unsigned int) next_src_vu;
237 next_interpl_y_upper = (unsigned int) next_src_y_upper;
238 next_interpl_y_lower = (unsigned int) next_src_y_lower;
239
240 // calculate weight NORTH-SOUTH
241 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
242 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
243 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
244
245 // get next lines
246 next_src_idx = curr_src_idx^1;
247 next_dst_idx = curr_dst_idx^1;
248
249 // 4 lines y
250 mfc_get( y_plane[next_src_idx],
251 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
252 src_dbl_linestride_y,
253 RETR_BUF+next_src_idx,
254 0, 0 );
255 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
256 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
257 src_dbl_linestride_y,
258 RETR_BUF+next_src_idx,
259 0, 0 );
260
261 // 2 lines v
262 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
263 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
264 mfc_get( v_plane[next_src_idx],
265 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
266 src_dbl_linestride_vu+(next_lsoff_v<<1),
267 RETR_BUF+next_src_idx,
268 0, 0 );
269 // 2 lines u
270 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
271 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
272 mfc_get( u_plane[next_src_idx],
273 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
274 src_dbl_linestride_vu+(next_lsoff_v<<1),
275 RETR_BUF+next_src_idx,
276 0, 0 );
277
278 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
279
280 // scaling
281 // work line y_upper
282 bilinear_scale_line_w16( y_plane[curr_src_idx],
283 scaled_y_plane[curr_src_idx],
284 dst_width,
285 vf_x_scale,
286 vf_curr_NSweight_y_upper,
287 src_linestride_y );
288 // work line y_lower
289 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
290 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
291 dst_width,
292 vf_x_scale,
293 vf_curr_NSweight_y_lower,
294 src_linestride_y );
295 // work line v
296 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
297 scaled_v_plane[curr_src_idx],
298 dst_width>>1,
299 vf_x_scale,
300 vf_curr_NSweight_vu,
301 src_linestride_vu );
302 // work line u
303 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
304 scaled_u_plane[curr_src_idx],
305 dst_width>>1,
306 vf_x_scale,
307 vf_curr_NSweight_vu,
308 src_linestride_vu );
309
310
311 // Store the result back to main memory into a destination buffer in YUV format
312 //---------------------------------------------------------------------------------------------
313 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
314
315 // Perform three DMA transfers to 3 different locations in the main memory!
316 // dst_width: Pixel width of destination image
317 // dst_addr: Destination address in main memory
318 // dst_vu: Counter which is incremented one by one
319 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
320 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
321 (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
322 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
323 STR_BUF+curr_dst_idx, // Tag
324 0, 0 );
325
326 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
327 (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
328 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
329 STR_BUF+curr_dst_idx, // Tag
330 0, 0 );
331
332 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
333 (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
334 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
335 STR_BUF+curr_dst_idx, // Tag
336 0, 0 );
337 //---------------------------------------------------------------------------------------------
338
339
340 // update for next cycle
341 curr_src_idx = next_src_idx;
342 curr_dst_idx = next_dst_idx;
343
344 curr_interpl_y_upper = next_interpl_y_upper;
345 curr_interpl_y_lower = next_interpl_y_lower;
346 curr_interpl_vu = next_interpl_vu;
347
348 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
349 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
350 vf_curr_NSweight_vu = vf_next_NSweight_vu;
351
352 curr_src_y_upper = next_src_y_upper;
353 curr_src_y_lower = next_src_y_lower;
354 curr_src_vu = next_src_vu;
355
356 curr_lsoff_v = next_lsoff_v;
357 curr_lsoff_u = next_lsoff_u;
358 }
359
360
361
362 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
363
364 // scaling
365 // work line y_upper
366 bilinear_scale_line_w16( y_plane[curr_src_idx],
367 scaled_y_plane[curr_src_idx],
368 dst_width,
369 vf_x_scale,
370 vf_curr_NSweight_y_upper,
371 src_linestride_y );
372 // work line y_lower
373 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
374 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
375 dst_width,
376 vf_x_scale,
377 vf_curr_NSweight_y_lower,
378 src_linestride_y );
379 // work line v
380 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
381 scaled_v_plane[curr_src_idx],
382 dst_width>>1,
383 vf_x_scale,
384 vf_curr_NSweight_vu,
385 src_linestride_vu );
386 // work line u
387 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
388 scaled_u_plane[curr_src_idx],
389 dst_width>>1,
390 vf_x_scale,
391 vf_curr_NSweight_vu,
392 src_linestride_vu );
393
394
395 // Store the result back to main memory into a destination buffer in YUV format
396 //---------------------------------------------------------------------------------------------
397 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
398
399 // Perform three DMA transfers to 3 different locations in the main memory!
400 // dst_width: Pixel width of destination image
401 // dst_addr: Destination address in main memory
402 // dst_vu: Counter which is incremented one by one
403 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
404 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
405 (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
406 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
407 STR_BUF+curr_dst_idx, // Tag
408 0, 0 );
409
410 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
411 (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
412 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
413 STR_BUF+curr_dst_idx, // Tag
414 0, 0 );
415
416 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
417 (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
418 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
419 STR_BUF+curr_dst_idx, // Tag
420 0, 0 );
421
422 // wait for completion
423 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
424 //---------------------------------------------------------------------------------------------
425}
426
427
428/*
429 * scale_srcw16_dstw32()
430 *
431 * processes an input image of width 16
432 * scaling is done to a width 32
433 * yuv2rgb conversion on a width of 32
434 * result stored in RAM
435 */
436void scale_srcw16_dstw32() {
437 // extract parameters
438 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
439
440 unsigned int src_width = parms.src_pixel_width;
441 unsigned int src_height = parms.src_pixel_height;
442 unsigned int dst_width = parms.dst_pixel_width;
443 unsigned int dst_height = parms.dst_pixel_height;
444
445 // YVU
446 unsigned int src_linestride_y = src_width;
447 unsigned int src_dbl_linestride_y = src_width<<1;
448 unsigned int src_linestride_vu = src_width>>1;
449 unsigned int src_dbl_linestride_vu = src_width;
450 // scaled YVU
451 unsigned int scaled_src_linestride_y = dst_width;
452
453 // ram addresses
454 unsigned char* src_addr_y = parms.y_plane;
455 unsigned char* src_addr_v = parms.v_plane;
456 unsigned char* src_addr_u = parms.u_plane;
457
458 unsigned int dst_picture_size = dst_width*dst_height;
459
460 // Sizes for destination
461 unsigned int dst_dbl_linestride_y = dst_width<<1;
462 unsigned int dst_dbl_linestride_vu = dst_width>>1;
463
464 // Perform address calculation for Y, V and U in main memory with dst_addr as base
465 unsigned char* dst_addr_main_memory_y = dst_addr;
466 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
467 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
468
469
470 // for handling misalignment, addresses are precalculated
471 unsigned char* precalc_src_addr_v = src_addr_v;
472 unsigned char* precalc_src_addr_u = src_addr_u;
473
474 // calculate scale factors
475 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
476 float y_scale = (float)src_height/(float)dst_height;
477
478 // double buffered processing
479 // buffer switching
480 unsigned int curr_src_idx = 0;
481 unsigned int curr_dst_idx = 0;
482 unsigned int next_src_idx, next_dst_idx;
483
484 // 2 lines y as output, upper and lowerline
485 unsigned int curr_interpl_y_upper = 0;
486 unsigned int next_interpl_y_upper;
487 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
488 // only 1 line v/u output, both planes have the same dimension
489 unsigned int curr_interpl_vu = 0;
490 unsigned int next_interpl_vu;
491
492 // weights, calculated in every loop iteration
493 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
494 vector float vf_next_NSweight_y_upper;
495 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
496 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
497 vector float vf_next_NSweight_vu;
498
499 // line indices for the src picture
500 float curr_src_y_upper = 0.0f, next_src_y_upper;
501 float curr_src_y_lower, next_src_y_lower;
502 float curr_src_vu = 0.0f, next_src_vu;
503
504 // line indices for the dst picture
505 unsigned int dst_y=0, dst_vu=0;
506
507 // offset for the v and u plane to handle misalignement
508 unsigned int curr_lsoff_v = 0, next_lsoff_v;
509 unsigned int curr_lsoff_u = 0, next_lsoff_u;
510
511 // calculate lower line idices
512 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
513 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
514 // lower line weight
515 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
516
517
518 // start partially double buffered processing
519 // get initial data, 2 sets of y, 1 set v, 1 set u
520 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
521 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
522 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
523 src_dbl_linestride_y,
524 RETR_BUF,
525 0, 0 );
526 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
527 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
528
529 // iteration loop
530 // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
531 // the scaled output is 2 lines y, 1 line v, 1 line u
532 // the yuv2rgb-converted output is stored to RAM
533 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
534 dst_y = dst_vu<<1;
535
536 // calculate next indices
537 next_src_vu = ((float)dst_vu+1)*y_scale;
538 next_src_y_upper = ((float)dst_y+2)*y_scale;
539 next_src_y_lower = ((float)dst_y+3)*y_scale;
540
541 next_interpl_vu = (unsigned int) next_src_vu;
542 next_interpl_y_upper = (unsigned int) next_src_y_upper;
543 next_interpl_y_lower = (unsigned int) next_src_y_lower;
544
545 // calculate weight NORTH-SOUTH
546 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
547 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
548 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
549
550 // get next lines
551 next_src_idx = curr_src_idx^1;
552 next_dst_idx = curr_dst_idx^1;
553
554 // 4 lines y
555 mfc_get( y_plane[next_src_idx],
556 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
557 src_dbl_linestride_y,
558 RETR_BUF+next_src_idx,
559 0, 0 );
560 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
561 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
562 src_dbl_linestride_y,
563 RETR_BUF+next_src_idx,
564 0, 0 );
565
566 // 2 lines v
567 precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
568 next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
569 mfc_get( v_plane[next_src_idx],
570 ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
571 src_dbl_linestride_vu+(next_lsoff_v<<1),
572 RETR_BUF+next_src_idx,
573 0, 0 );
574 // 2 lines u
575 precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
576 next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
577 mfc_get( u_plane[next_src_idx],
578 ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
579 src_dbl_linestride_vu+(next_lsoff_v<<1),
580 RETR_BUF+next_src_idx,
581 0, 0 );
582
583 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
584
585 // scaling
586 // work line y_upper
587 bilinear_scale_line_w16( y_plane[curr_src_idx],
588 scaled_y_plane[curr_src_idx],
589 dst_width,
590 vf_x_scale,
591 vf_curr_NSweight_y_upper,
592 src_linestride_y );
593 // work line y_lower
594 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
595 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
596 dst_width,
597 vf_x_scale,
598 vf_curr_NSweight_y_lower,
599 src_linestride_y );
600 // work line v
601 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
602 scaled_v_plane[curr_src_idx],
603 dst_width>>1,
604 vf_x_scale,
605 vf_curr_NSweight_vu,
606 src_linestride_vu );
607 // work line u
608 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
609 scaled_u_plane[curr_src_idx],
610 dst_width>>1,
611 vf_x_scale,
612 vf_curr_NSweight_vu,
613 src_linestride_vu );
614
615 //---------------------------------------------------------------------------------------------
616 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
617
618 // Perform three DMA transfers to 3 different locations in the main memory!
619 // dst_width: Pixel width of destination image
620 // dst_addr: Destination address in main memory
621 // dst_vu: Counter which is incremented one by one
622 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
623
624 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
625 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
626 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
627 STR_BUF+curr_dst_idx, // Tag
628 0, 0 );
629
630 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
631 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
632 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
633 STR_BUF+curr_dst_idx, // Tag
634 0, 0 );
635
636 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
637 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
638 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
639 STR_BUF+curr_dst_idx, // Tag
640 0, 0 );
641 //---------------------------------------------------------------------------------------------
642
643
644 // update for next cycle
645 curr_src_idx = next_src_idx;
646 curr_dst_idx = next_dst_idx;
647
648 curr_interpl_y_upper = next_interpl_y_upper;
649 curr_interpl_y_lower = next_interpl_y_lower;
650 curr_interpl_vu = next_interpl_vu;
651
652 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
653 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
654 vf_curr_NSweight_vu = vf_next_NSweight_vu;
655
656 curr_src_y_upper = next_src_y_upper;
657 curr_src_y_lower = next_src_y_lower;
658 curr_src_vu = next_src_vu;
659
660 curr_lsoff_v = next_lsoff_v;
661 curr_lsoff_u = next_lsoff_u;
662 }
663
664
665
666 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
667
668 // scaling
669 // work line y_upper
670 bilinear_scale_line_w16( y_plane[curr_src_idx],
671 scaled_y_plane[curr_src_idx],
672 dst_width,
673 vf_x_scale,
674 vf_curr_NSweight_y_upper,
675 src_linestride_y );
676 // work line y_lower
677 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
678 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
679 dst_width,
680 vf_x_scale,
681 vf_curr_NSweight_y_lower,
682 src_linestride_y );
683 // work line v
684 bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
685 scaled_v_plane[curr_src_idx],
686 dst_width>>1,
687 vf_x_scale,
688 vf_curr_NSweight_vu,
689 src_linestride_vu );
690 // work line u
691 bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
692 scaled_u_plane[curr_src_idx],
693 dst_width>>1,
694 vf_x_scale,
695 vf_curr_NSweight_vu,
696 src_linestride_vu );
697
698 //---------------------------------------------------------------------------------------------
699 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
700
701 // Perform three DMA transfers to 3 different locations in the main memory!
702 // dst_width: Pixel width of destination image
703 // dst_addr: Destination address in main memory
704 // dst_vu: Counter which is incremented one by one
705 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
706
707 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
708 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
709 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
710 STR_BUF+curr_dst_idx, // Tag
711 0, 0 );
712
713 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
714 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
715 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
716 STR_BUF+curr_dst_idx, // Tag
717 0, 0 );
718
719 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
720 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
721 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
722 STR_BUF+curr_dst_idx, // Tag
723 0, 0 );
724
725 // wait for completion
726 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
727 //---------------------------------------------------------------------------------------------
728}
729
730
731/*
732 * scale_srcw32_dstw16()
733 *
734 * processes an input image of width 32
735 * scaling is done to a width 16
736 * yuv2rgb conversion on a width of 16
737 * result stored in RAM
738 */
739void scale_srcw32_dstw16() {
740 // extract parameters
741 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
742
743 unsigned int src_width = parms.src_pixel_width;
744 unsigned int src_height = parms.src_pixel_height;
745 unsigned int dst_width = parms.dst_pixel_width;
746 unsigned int dst_height = parms.dst_pixel_height;
747
748 // YVU
749 unsigned int src_linestride_y = src_width;
750 unsigned int src_dbl_linestride_y = src_width<<1;
751 unsigned int src_linestride_vu = src_width>>1;
752 unsigned int src_dbl_linestride_vu = src_width;
753 // scaled YVU
754 unsigned int scaled_src_linestride_y = dst_width;
755
756 // ram addresses
757 unsigned char* src_addr_y = parms.y_plane;
758 unsigned char* src_addr_v = parms.v_plane;
759 unsigned char* src_addr_u = parms.u_plane;
760
761 unsigned int dst_picture_size = dst_width*dst_height;
762
763 // Sizes for destination
764 unsigned int dst_dbl_linestride_y = dst_width<<1;
765 unsigned int dst_dbl_linestride_vu = dst_width>>1;
766
767 // Perform address calculation for Y, V and U in main memory with dst_addr as base
768 unsigned char* dst_addr_main_memory_y = dst_addr;
769 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
770 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
771
772 // calculate scale factors
773 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
774 float y_scale = (float)src_height/(float)dst_height;
775
776 // double buffered processing
777 // buffer switching
778 unsigned int curr_src_idx = 0;
779 unsigned int curr_dst_idx = 0;
780 unsigned int next_src_idx, next_dst_idx;
781
782 // 2 lines y as output, upper and lowerline
783 unsigned int curr_interpl_y_upper = 0;
784 unsigned int next_interpl_y_upper;
785 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
786 // only 1 line v/u output, both planes have the same dimension
787 unsigned int curr_interpl_vu = 0;
788 unsigned int next_interpl_vu;
789
790 // weights, calculated in every loop iteration
791 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
792 vector float vf_next_NSweight_y_upper;
793 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
794 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
795 vector float vf_next_NSweight_vu;
796
797 // line indices for the src picture
798 float curr_src_y_upper = 0.0f, next_src_y_upper;
799 float curr_src_y_lower, next_src_y_lower;
800 float curr_src_vu = 0.0f, next_src_vu;
801
802 // line indices for the dst picture
803 unsigned int dst_y=0, dst_vu=0;
804
805 // calculate lower line idices
806 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
807 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
808 // lower line weight
809 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
810
811
812 // start partially double buffered processing
813 // get initial data, 2 sets of y, 1 set v, 1 set u
814 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
815 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
816 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
817 src_dbl_linestride_y,
818 RETR_BUF,
819 0, 0 );
820 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
821 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
822
823 // iteration loop
824 // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
825 // the scaled output is 2 lines y, 1 line v, 1 line u
826 // the yuv2rgb-converted output is stored to RAM
827 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
828 dst_y = dst_vu<<1;
829
830 // calculate next indices
831 next_src_vu = ((float)dst_vu+1)*y_scale;
832 next_src_y_upper = ((float)dst_y+2)*y_scale;
833 next_src_y_lower = ((float)dst_y+3)*y_scale;
834
835 next_interpl_vu = (unsigned int) next_src_vu;
836 next_interpl_y_upper = (unsigned int) next_src_y_upper;
837 next_interpl_y_lower = (unsigned int) next_src_y_lower;
838
839 // calculate weight NORTH-SOUTH
840 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
841 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
842 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
843
844 // get next lines
845 next_src_idx = curr_src_idx^1;
846 next_dst_idx = curr_dst_idx^1;
847
848 // 4 lines y
849 mfc_get( y_plane[next_src_idx],
850 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
851 src_dbl_linestride_y,
852 RETR_BUF+next_src_idx,
853 0, 0 );
854 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
855 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
856 src_dbl_linestride_y,
857 RETR_BUF+next_src_idx,
858 0, 0 );
859
860 // 2 lines v
861 mfc_get( v_plane[next_src_idx],
862 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
863 src_dbl_linestride_vu,
864 RETR_BUF+next_src_idx,
865 0, 0 );
866 // 2 lines u
867 mfc_get( u_plane[next_src_idx],
868 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
869 src_dbl_linestride_vu,
870 RETR_BUF+next_src_idx,
871 0, 0 );
872
873 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
874
875 // scaling
876 // work line y_upper
877 bilinear_scale_line_w16( y_plane[curr_src_idx],
878 scaled_y_plane[curr_src_idx],
879 dst_width,
880 vf_x_scale,
881 vf_curr_NSweight_y_upper,
882 src_linestride_y );
883 // work line y_lower
884 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
885 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
886 dst_width,
887 vf_x_scale,
888 vf_curr_NSweight_y_lower,
889 src_linestride_y );
890 // work line v
891 bilinear_scale_line_w16( v_plane[curr_src_idx],
892 scaled_v_plane[curr_src_idx],
893 dst_width>>1,
894 vf_x_scale,
895 vf_curr_NSweight_vu,
896 src_linestride_vu );
897 // work line u
898 bilinear_scale_line_w16( u_plane[curr_src_idx],
899 scaled_u_plane[curr_src_idx],
900 dst_width>>1,
901 vf_x_scale,
902 vf_curr_NSweight_vu,
903 src_linestride_vu );
904
905 //---------------------------------------------------------------------------------------------
906 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
907
908 // Perform three DMA transfers to 3 different locations in the main memory!
909 // dst_width: Pixel width of destination image
910 // dst_addr: Destination address in main memory
911 // dst_vu: Counter which is incremented one by one
912 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
913
914 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
915 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
916 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
917 STR_BUF+curr_dst_idx, // Tag
918 0, 0 );
919
920 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
921 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
922 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
923 STR_BUF+curr_dst_idx, // Tag
924 0, 0 );
925
926 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
927 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
928 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
929 STR_BUF+curr_dst_idx, // Tag
930 0, 0 );
931 //---------------------------------------------------------------------------------------------
932
933
934 // update for next cycle
935 curr_src_idx = next_src_idx;
936 curr_dst_idx = next_dst_idx;
937
938 curr_interpl_y_upper = next_interpl_y_upper;
939 curr_interpl_y_lower = next_interpl_y_lower;
940 curr_interpl_vu = next_interpl_vu;
941
942 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
943 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
944 vf_curr_NSweight_vu = vf_next_NSweight_vu;
945
946 curr_src_y_upper = next_src_y_upper;
947 curr_src_y_lower = next_src_y_lower;
948 curr_src_vu = next_src_vu;
949 }
950
951
952
953 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
954
955 // scaling
956 // work line y_upper
957 bilinear_scale_line_w16( y_plane[curr_src_idx],
958 scaled_y_plane[curr_src_idx],
959 dst_width,
960 vf_x_scale,
961 vf_curr_NSweight_y_upper,
962 src_linestride_y );
963 // work line y_lower
964 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
965 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
966 dst_width,
967 vf_x_scale,
968 vf_curr_NSweight_y_lower,
969 src_linestride_y );
970 // work line v
971 bilinear_scale_line_w16( v_plane[curr_src_idx],
972 scaled_v_plane[curr_src_idx],
973 dst_width>>1,
974 vf_x_scale,
975 vf_curr_NSweight_vu,
976 src_linestride_vu );
977 // work line u
978 bilinear_scale_line_w16( u_plane[curr_src_idx],
979 scaled_u_plane[curr_src_idx],
980 dst_width>>1,
981 vf_x_scale,
982 vf_curr_NSweight_vu,
983 src_linestride_vu );
984
985
986 //---------------------------------------------------------------------------------------------
987 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
988
989 // Perform three DMA transfers to 3 different locations in the main memory!
990 // dst_width: Pixel width of destination image
991 // dst_addr: Destination address in main memory
992 // dst_vu: Counter which is incremented one by one
993 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
994
995 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
996 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
997 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
998 STR_BUF+curr_dst_idx, // Tag
999 0, 0 );
1000
1001 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
1002 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1003 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
1004 STR_BUF+curr_dst_idx, // Tag
1005 0, 0 );
1006
1007 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
1008 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1009 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
1010 STR_BUF+curr_dst_idx, // Tag
1011 0, 0 );
1012
1013 // wait for completion
1014 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1015 //---------------------------------------------------------------------------------------------
1016}
1017
1018
1019/**
1020 * scale_srcw32_dstw32()
1021 *
1022 * processes an input image of width 32
1023 * scaling is done to a width 32
1024 * yuv2rgb conversion on a width of 32
1025 * result stored in RAM
1026 */
1027void scale_srcw32_dstw32() {
1028 // extract parameters
1029 unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
1030
1031 unsigned int src_width = parms.src_pixel_width;
1032 unsigned int src_height = parms.src_pixel_height;
1033 unsigned int dst_width = parms.dst_pixel_width;
1034 unsigned int dst_height = parms.dst_pixel_height;
1035
1036 // YVU
1037 unsigned int src_linestride_y = src_width;
1038 unsigned int src_dbl_linestride_y = src_width<<1;
1039 unsigned int src_linestride_vu = src_width>>1;
1040 unsigned int src_dbl_linestride_vu = src_width;
1041
1042 // scaled YVU
1043 unsigned int scaled_src_linestride_y = dst_width;
1044
1045 // ram addresses
1046 unsigned char* src_addr_y = parms.y_plane;
1047 unsigned char* src_addr_v = parms.v_plane;
1048 unsigned char* src_addr_u = parms.u_plane;
1049
1050 unsigned int dst_picture_size = dst_width*dst_height;
1051
1052 // Sizes for destination
1053 unsigned int dst_dbl_linestride_y = dst_width<<1;
1054 unsigned int dst_dbl_linestride_vu = dst_width>>1;
1055
1056 // Perform address calculation for Y, V and U in main memory with dst_addr as base
1057 unsigned char* dst_addr_main_memory_y = dst_addr;
1058 unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
1059 unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
1060
1061 // calculate scale factors
1062 vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
1063 float y_scale = (float)src_height/(float)dst_height;
1064
1065 // double buffered processing
1066 // buffer switching
1067 unsigned int curr_src_idx = 0;
1068 unsigned int curr_dst_idx = 0;
1069 unsigned int next_src_idx, next_dst_idx;
1070
1071 // 2 lines y as output, upper and lowerline
1072 unsigned int curr_interpl_y_upper = 0;
1073 unsigned int next_interpl_y_upper;
1074 unsigned int curr_interpl_y_lower, next_interpl_y_lower;
1075 // only 1 line v/u output, both planes have the same dimension
1076 unsigned int curr_interpl_vu = 0;
1077 unsigned int next_interpl_vu;
1078
1079 // weights, calculated in every loop iteration
1080 vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
1081 vector float vf_next_NSweight_y_upper;
1082 vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
1083 vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
1084 vector float vf_next_NSweight_vu;
1085
1086 // line indices for the src picture
1087 float curr_src_y_upper = 0.0f, next_src_y_upper;
1088 float curr_src_y_lower, next_src_y_lower;
1089 float curr_src_vu = 0.0f, next_src_vu;
1090
1091 // line indices for the dst picture
1092 unsigned int dst_y=0, dst_vu=0;
1093
1094 // calculate lower line idices
1095 curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
1096 curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
1097 // lower line weight
1098 vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
1099
1100
1101 // start partially double buffered processing
1102 // get initial data, 2 sets of y, 1 set v, 1 set u
1103 mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
1104 mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
1105 (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
1106 src_dbl_linestride_y,
1107 RETR_BUF,
1108 0, 0 );
1109 mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1110 mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
1111
1112 // iteration loop
1113 // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
1114 // the scaled output is 2 lines y, 1 line v, 1 line u
1115 // the yuv2rgb-converted output is stored to RAM
1116 for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
1117 dst_y = dst_vu<<1;
1118
1119 // calculate next indices
1120 next_src_vu = ((float)dst_vu+1)*y_scale;
1121 next_src_y_upper = ((float)dst_y+2)*y_scale;
1122 next_src_y_lower = ((float)dst_y+3)*y_scale;
1123
1124 next_interpl_vu = (unsigned int) next_src_vu;
1125 next_interpl_y_upper = (unsigned int) next_src_y_upper;
1126 next_interpl_y_lower = (unsigned int) next_src_y_lower;
1127
1128 // calculate weight NORTH-SOUTH
1129 vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
1130 vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
1131 vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
1132
1133 // get next lines
1134 next_src_idx = curr_src_idx^1;
1135 next_dst_idx = curr_dst_idx^1;
1136
1137 // 4 lines y
1138 mfc_get( y_plane[next_src_idx],
1139 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
1140 src_dbl_linestride_y,
1141 RETR_BUF+next_src_idx,
1142 0, 0 );
1143 mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
1144 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
1145 src_dbl_linestride_y,
1146 RETR_BUF+next_src_idx,
1147 0, 0 );
1148
1149 // 2 lines v
1150 mfc_get( v_plane[next_src_idx],
1151 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
1152 src_dbl_linestride_vu,
1153 RETR_BUF+next_src_idx,
1154 0, 0 );
1155 // 2 lines u
1156 mfc_get( u_plane[next_src_idx],
1157 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
1158 src_dbl_linestride_vu,
1159 RETR_BUF+next_src_idx,
1160 0, 0 );
1161
1162 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1163
1164 // scaling
1165 // work line y_upper
1166 bilinear_scale_line_w16( y_plane[curr_src_idx],
1167 scaled_y_plane[curr_src_idx],
1168 dst_width,
1169 vf_x_scale,
1170 vf_curr_NSweight_y_upper,
1171 src_linestride_y );
1172 // work line y_lower
1173 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1174 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1175 dst_width,
1176 vf_x_scale,
1177 vf_curr_NSweight_y_lower,
1178 src_linestride_y );
1179 // work line v
1180 bilinear_scale_line_w16( v_plane[curr_src_idx],
1181 scaled_v_plane[curr_src_idx],
1182 dst_width>>1,
1183 vf_x_scale,
1184 vf_curr_NSweight_vu,
1185 src_linestride_vu );
1186 // work line u
1187 bilinear_scale_line_w16( u_plane[curr_src_idx],
1188 scaled_u_plane[curr_src_idx],
1189 dst_width>>1,
1190 vf_x_scale,
1191 vf_curr_NSweight_vu,
1192 src_linestride_vu );
1193
1194
1195
1196 // Store the result back to main memory into a destination buffer in YUV format
1197 //---------------------------------------------------------------------------------------------
1198 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1199
1200 // Perform three DMA transfers to 3 different locations in the main memory!
1201 // dst_width: Pixel width of destination image
1202 // dst_addr: Destination address in main memory
1203 // dst_vu: Counter which is incremented one by one
1204 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1205
1206 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
1207 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
1208 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
1209 STR_BUF+curr_dst_idx, // Tag
1210 0, 0 );
1211
1212 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
1213 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1214 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
1215 STR_BUF+curr_dst_idx, // Tag
1216 0, 0 );
1217
1218 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
1219 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1220 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
1221 STR_BUF+curr_dst_idx, // Tag
1222 0, 0 );
1223 //---------------------------------------------------------------------------------------------
1224
1225
1226 // update for next cycle
1227 curr_src_idx = next_src_idx;
1228 curr_dst_idx = next_dst_idx;
1229
1230 curr_interpl_y_upper = next_interpl_y_upper;
1231 curr_interpl_y_lower = next_interpl_y_lower;
1232 curr_interpl_vu = next_interpl_vu;
1233
1234 vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
1235 vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
1236 vf_curr_NSweight_vu = vf_next_NSweight_vu;
1237
1238 curr_src_y_upper = next_src_y_upper;
1239 curr_src_y_lower = next_src_y_lower;
1240 curr_src_vu = next_src_vu;
1241 }
1242
1243
1244
1245 DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
1246
1247 // scaling
1248 // work line y_upper
1249 bilinear_scale_line_w16( y_plane[curr_src_idx],
1250 scaled_y_plane[curr_src_idx],
1251 dst_width,
1252 vf_x_scale,
1253 vf_curr_NSweight_y_upper,
1254 src_linestride_y );
1255 // work line y_lower
1256 bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
1257 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
1258 dst_width,
1259 vf_x_scale,
1260 vf_curr_NSweight_y_lower,
1261 src_linestride_y );
1262 // work line v
1263 bilinear_scale_line_w16( v_plane[curr_src_idx],
1264 scaled_v_plane[curr_src_idx],
1265 dst_width>>1,
1266 vf_x_scale,
1267 vf_curr_NSweight_vu,
1268 src_linestride_vu );
1269 // work line u
1270 bilinear_scale_line_w16( u_plane[curr_src_idx],
1271 scaled_u_plane[curr_src_idx],
1272 dst_width>>1,
1273 vf_x_scale,
1274 vf_curr_NSweight_vu,
1275 src_linestride_vu );
1276
1277
1278 // Store the result back to main memory into a destination buffer in YUV format
1279 //---------------------------------------------------------------------------------------------
1280 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1281
1282 // Perform three DMA transfers to 3 different locations in the main memory!
1283 // dst_width: Pixel width of destination image
1284 // dst_addr: Destination address in main memory
1285 // dst_vu: Counter which is incremented one by one
1286 // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
1287
1288 mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
1289 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
1290 dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
1291 STR_BUF+curr_dst_idx, // Tag
1292 0, 0 );
1293
1294 mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
1295 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1296 dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
1297 STR_BUF+curr_dst_idx, // Tag
1298 0, 0 );
1299
1300 mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
1301 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
1302 dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
1303 STR_BUF+curr_dst_idx, // Tag
1304 0, 0 );
1305
1306 // wait for completion
1307 DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
1308 //---------------------------------------------------------------------------------------------
1309}
1310
1311
1312/*
1313 * bilinear_scale_line_w8()
1314 *
1315 * processes a line of yuv-input, width has to be a multiple of 8
1316 * scaled yuv-output is written to local store buffer
1317 *
1318 * @param src buffer for 2 lines input
1319 * @param dst_ buffer for 1 line output
1320 * @param dst_width the width of the destination line
1321 * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1322 * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1323 * @param src_linestride the stride of the srcline
1324 */
1325void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1326
1327 unsigned char* dst = dst_;
1328
1329 unsigned int dst_x;
1330 for( dst_x=0; dst_x<dst_width; dst_x+=8) {
1331 // address calculation for loading the 4 surrounding pixel of each calculated
1332 // destination pixel
1333 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1334 // lower range->first 4 pixel
1335 // upper range->next 4 pixel
1336 vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
1337 vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
1338 vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
1339 vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
1340
1341 // calculate weight EAST-WEST
1342 vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
1343 vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
1344 vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
1345 vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
1346 vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
1347 vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
1348 vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
1349 vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
1350 vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
1351 vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
1352
1353 // calculate address offset
1354 //
1355 // pixel NORTH WEST
1356 vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
1357 vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
1358
1359 // pixel NORTH EAST-->(offpixelNW+1)
1360 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1361 vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
1362 vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
1363
1364 // SOUTH-WEST-->(offpixelNW+src_linestride)
1365 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1366 vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
1367 vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
1368
1369 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1370 vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
1371 vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
1372
1373 // calculate each address
1374 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1375 vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
1376 vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
1377 vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
1378 vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
1379
1380 vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
1381 vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
1382 vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
1383 vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
1384
1385 // get each pixel
1386 //
1387 // scalar load, afterwards insertion into the right position
1388 // NORTH WEST
1389 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1390 vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
1391 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
1392 vuc_pixel_NW_lower_range = spu_insert(
1393 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
1394 vuc_pixel_NW_lower_range, 7 );
1395 vuc_pixel_NW_lower_range = spu_insert(
1396 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
1397 vuc_pixel_NW_lower_range, 11 );
1398 vuc_pixel_NW_lower_range = spu_insert(
1399 *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
1400 vuc_pixel_NW_lower_range, 15 );
1401
1402 vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
1403 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
1404 vuc_pixel_NW_upper_range = spu_insert(
1405 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
1406 vuc_pixel_NW_upper_range, 7 );
1407 vuc_pixel_NW_upper_range = spu_insert(
1408 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
1409 vuc_pixel_NW_upper_range, 11 );
1410 vuc_pixel_NW_upper_range = spu_insert(
1411 *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
1412 vuc_pixel_NW_upper_range, 15 );
1413
1414 // NORTH EAST
1415 vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
1416 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
1417 vuc_pixel_NE_lower_range = spu_insert(
1418 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
1419 vuc_pixel_NE_lower_range, 7 );
1420 vuc_pixel_NE_lower_range = spu_insert(
1421 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
1422 vuc_pixel_NE_lower_range, 11 );
1423 vuc_pixel_NE_lower_range = spu_insert(
1424 *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
1425 vuc_pixel_NE_lower_range, 15 );
1426
1427 vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
1428 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
1429 vuc_pixel_NE_upper_range = spu_insert(
1430 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
1431 vuc_pixel_NE_upper_range, 7 );
1432 vuc_pixel_NE_upper_range = spu_insert(
1433 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
1434 vuc_pixel_NE_upper_range, 11 );
1435 vuc_pixel_NE_upper_range = spu_insert(
1436 *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
1437 vuc_pixel_NE_upper_range, 15 );
1438
1439
1440 // SOUTH WEST
1441 vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
1442 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
1443 vuc_pixel_SW_lower_range = spu_insert(
1444 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
1445 vuc_pixel_SW_lower_range, 7 );
1446 vuc_pixel_SW_lower_range = spu_insert(
1447 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
1448 vuc_pixel_SW_lower_range, 11 );
1449 vuc_pixel_SW_lower_range = spu_insert(
1450 *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
1451 vuc_pixel_SW_lower_range, 15 );
1452
1453 vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
1454 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
1455 vuc_pixel_SW_upper_range = spu_insert(
1456 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
1457 vuc_pixel_SW_upper_range, 7 );
1458 vuc_pixel_SW_upper_range = spu_insert(
1459 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
1460 vuc_pixel_SW_upper_range, 11 );
1461 vuc_pixel_SW_upper_range = spu_insert(
1462 *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
1463 vuc_pixel_SW_upper_range, 15 );
1464
1465 // SOUTH EAST
1466 vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
1467 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
1468 vuc_pixel_SE_lower_range = spu_insert(
1469 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
1470 vuc_pixel_SE_lower_range, 7 );
1471 vuc_pixel_SE_lower_range = spu_insert(
1472 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
1473 vuc_pixel_SE_lower_range, 11 );
1474 vuc_pixel_SE_lower_range = spu_insert(
1475 *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
1476 vuc_pixel_SE_lower_range, 15 );
1477
1478 vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
1479 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
1480 vuc_pixel_SE_upper_range = spu_insert(
1481 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
1482 vuc_pixel_SE_upper_range, 7 );
1483 vuc_pixel_SE_upper_range = spu_insert(
1484 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
1485 vuc_pixel_SE_upper_range, 11 );
1486 vuc_pixel_SE_upper_range = spu_insert(
1487 *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
1488 vuc_pixel_SE_upper_range, 15 );
1489
1490
1491 // convert to float
1492 vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
1493 vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
1494
1495 vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
1496 vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
1497
1498 vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
1499 vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
1500
1501 vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
1502 vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
1503
1504
1505
1506 // first linear interpolation: EWtop
1507 // EWtop = NW + EWweight*(NE-NW)
1508 //
1509 // lower range
1510 vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
1511 vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
1512 vf_EWtop_lower_range_tmp,
1513 vf_pixel_NW_lower_range );
1514
1515 // upper range
1516 vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
1517 vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
1518 vf_EWtop_upper_range_tmp,
1519 vf_pixel_NW_upper_range );
1520
1521
1522
1523 // second linear interpolation: EWbottom
1524 // EWbottom = SW + EWweight*(SE-SW)
1525 //
1526 // lower range
1527 vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
1528 vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
1529 vf_EWbottom_lower_range_tmp,
1530 vf_pixel_SW_lower_range );
1531
1532 // upper range
1533 vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
1534 vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
1535 vf_EWbottom_upper_range_tmp,
1536 vf_pixel_SW_upper_range );
1537
1538
1539
1540 // third linear interpolation: the bilinear interpolated value
1541 // result = EWtop + NSweight*(EWbottom-EWtop);
1542 //
1543 // lower range
1544 vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
1545 vector float vf_result_lower_range = spu_madd( vf_NSweight,
1546 vf_result_lower_range_tmp,
1547 vf_EWtop_lower_range );
1548
1549 // upper range
1550 vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
1551 vector float vf_result_upper_range = spu_madd( vf_NSweight,
1552 vf_result_upper_range_tmp,
1553 vf_EWtop_upper_range );
1554
1555
1556 // convert back: using saturated arithmetic
1557 vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
1558 vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
1559
1560 // merge results->lower,upper
1561 vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
1562 0x13, 0x17, 0x1B, 0x1F,
1563 0x00, 0x00, 0x00, 0x00,
1564 0x00, 0x00, 0x00, 0x00 };
1565
1566 vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
1567 (vector unsigned char) vui_result_upper_range,
1568 vuc_mask_merge_result );
1569
1570 // partial storing
1571 vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
1572 0x00, 0x00, 0x00, 0x00,
1573 0xFF, 0xFF, 0xFF, 0xFF,
1574 0xFF, 0xFF, 0xFF, 0xFF };
1575
1576
1577 // get currently stored data
1578 vector unsigned char vuc_orig = *((vector unsigned char*)dst);
1579
1580 // clear currently stored data
1581 vuc_orig = spu_and( vuc_orig,
1582 spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
1583
1584 // rotate result according to storing address
1585 vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
1586
1587 // store result
1588 *((vector unsigned char*)dst) = spu_or( vuc_result,
1589 vuc_orig );
1590 dst += 8;
1591 }
1592}
1593
1594
1595/*
1596 * bilinear_scale_line_w16()
1597 *
1598 * processes a line of yuv-input, width has to be a multiple of 16
1599 * scaled yuv-output is written to local store buffer
1600 *
1601 * @param src buffer for 2 lines input
1602 * @param dst_ buffer for 1 line output
1603 * @param dst_width the width of the destination line
1604 * @param vf_x_scale a float vector, at each entry is the x_scale-factor
1605 * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
1606 * @param src_linestride the stride of the srcline
1607 */
1608void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
1609
1610 unsigned char* dst = dst_;
1611
1612 unsigned int dst_x;
1613 for( dst_x=0; dst_x<dst_width; dst_x+=16) {
1614 // address calculation for loading the 4 surrounding pixel of each calculated
1615 // destination pixel
1616 vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
1617 // parallelised processing
1618 // first range->pixel 1 2 3 4
1619 // second range->pixel 5 6 7 8
1620 // third range->pixel 9 10 11 12
1621 // fourth range->pixel 13 14 15 16
1622 vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
1623 vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
1624 vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
1625 vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
1626 vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
1627 vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
1628 vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
1629 vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
1630
1631 // calculate weight EAST-WEST
1632 vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
1633 vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
1634 vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
1635 vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
1636 vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
1637 vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
1638 vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
1639 vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
1640 vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
1641 vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
1642 vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
1643 vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
1644 vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
1645 vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
1646 vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
1647 vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
1648 vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
1649 vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
1650 vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
1651 vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
1652
1653 // calculate address offset
1654 //
1655 // pixel NORTH WEST
1656 vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
1657 vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
1658 vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
1659 vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
1660
1661 // pixel NORTH EAST-->(offpixelNW+1)
1662 vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
1663 vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
1664 vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
1665 vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
1666 vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
1667
1668 // SOUTH-WEST-->(offpixelNW+src_linestride)
1669 vector unsigned int vui_srclinestride = spu_splats( src_linestride );
1670 vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
1671 vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
1672 vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
1673 vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
1674
1675 // SOUTH-EAST-->(offpixelNW+src_linestride+1)
1676 vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
1677 vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
1678 vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
1679 vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
1680
1681 // calculate each address
1682 vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
1683 vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
1684 vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
1685 vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
1686 vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
1687
1688 vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
1689 vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
1690 vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
1691 vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
1692
1693 vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
1694 vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
1695 vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
1696 vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
1697
1698 vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
1699 vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
1700 vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
1701 vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
1702
1703
1704 // get each pixel
1705 //
1706 // scalar load, afterwards insertion into the right position
1707 // NORTH WEST
1708 // first range
1709 vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
1710 vector unsigned char vuc_pixel_NW_first_range = spu_insert(
1711 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
1712 vuc_pixel_NW_first_range = spu_insert(
1713 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
1714 vuc_pixel_NW_first_range, 7 );
1715 vuc_pixel_NW_first_range = spu_insert(
1716 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
1717 vuc_pixel_NW_first_range, 11 );
1718 vuc_pixel_NW_first_range = spu_insert(
1719 *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
1720 vuc_pixel_NW_first_range, 15 );
1721 // second range
1722 vector unsigned char vuc_pixel_NW_second_range = spu_insert(
1723 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
1724 vuc_pixel_NW_second_range = spu_insert(
1725 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
1726 vuc_pixel_NW_second_range, 7 );
1727 vuc_pixel_NW_second_range = spu_insert(
1728 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
1729 vuc_pixel_NW_second_range, 11 );
1730 vuc_pixel_NW_second_range = spu_insert(
1731 *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
1732 vuc_pixel_NW_second_range, 15 );
1733 // third range
1734 vector unsigned char vuc_pixel_NW_third_range = spu_insert(
1735 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
1736 vuc_pixel_NW_third_range = spu_insert(
1737 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
1738 vuc_pixel_NW_third_range, 7 );
1739 vuc_pixel_NW_third_range = spu_insert(
1740 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
1741 vuc_pixel_NW_third_range, 11 );
1742 vuc_pixel_NW_third_range = spu_insert(
1743 *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
1744 vuc_pixel_NW_third_range, 15 );
1745 // fourth range
1746 vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
1747 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
1748 vuc_pixel_NW_fourth_range = spu_insert(
1749 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
1750 vuc_pixel_NW_fourth_range, 7 );
1751 vuc_pixel_NW_fourth_range = spu_insert(
1752 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
1753 vuc_pixel_NW_fourth_range, 11 );
1754 vuc_pixel_NW_fourth_range = spu_insert(
1755 *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
1756 vuc_pixel_NW_fourth_range, 15 );
1757
1758 // NORTH EAST
1759 // first range
1760 vector unsigned char vuc_pixel_NE_first_range = spu_insert(
1761 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
1762 vuc_pixel_NE_first_range = spu_insert(
1763 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
1764 vuc_pixel_NE_first_range, 7 );
1765 vuc_pixel_NE_first_range = spu_insert(
1766 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
1767 vuc_pixel_NE_first_range, 11 );
1768 vuc_pixel_NE_first_range = spu_insert(
1769 *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
1770 vuc_pixel_NE_first_range, 15 );
1771 // second range
1772 vector unsigned char vuc_pixel_NE_second_range = spu_insert(
1773 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
1774 vuc_pixel_NE_second_range = spu_insert(
1775 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
1776 vuc_pixel_NE_second_range, 7 );
1777 vuc_pixel_NE_second_range = spu_insert(
1778 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
1779 vuc_pixel_NE_second_range, 11 );
1780 vuc_pixel_NE_second_range = spu_insert(
1781 *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
1782 vuc_pixel_NE_second_range, 15 );
1783 // third range
1784 vector unsigned char vuc_pixel_NE_third_range = spu_insert(
1785 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
1786 vuc_pixel_NE_third_range = spu_insert(
1787 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
1788 vuc_pixel_NE_third_range, 7 );
1789 vuc_pixel_NE_third_range = spu_insert(
1790 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
1791 vuc_pixel_NE_third_range, 11 );
1792 vuc_pixel_NE_third_range = spu_insert(
1793 *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
1794 vuc_pixel_NE_third_range, 15 );
1795 // fourth range
1796 vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
1797 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
1798 vuc_pixel_NE_fourth_range = spu_insert(
1799 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
1800 vuc_pixel_NE_fourth_range, 7 );
1801 vuc_pixel_NE_fourth_range = spu_insert(
1802 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
1803 vuc_pixel_NE_fourth_range, 11 );
1804 vuc_pixel_NE_fourth_range = spu_insert(
1805 *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
1806 vuc_pixel_NE_fourth_range, 15 );
1807
1808 // SOUTH WEST
1809 // first range
1810 vector unsigned char vuc_pixel_SW_first_range = spu_insert(
1811 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
1812 vuc_pixel_SW_first_range = spu_insert(
1813 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
1814 vuc_pixel_SW_first_range, 7 );
1815 vuc_pixel_SW_first_range = spu_insert(
1816 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
1817 vuc_pixel_SW_first_range, 11 );
1818 vuc_pixel_SW_first_range = spu_insert(
1819 *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
1820 vuc_pixel_SW_first_range, 15 );
1821 // second range
1822 vector unsigned char vuc_pixel_SW_second_range = spu_insert(
1823 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
1824 vuc_pixel_SW_second_range = spu_insert(
1825 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
1826 vuc_pixel_SW_second_range, 7 );
1827 vuc_pixel_SW_second_range = spu_insert(
1828 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
1829 vuc_pixel_SW_second_range, 11 );
1830 vuc_pixel_SW_second_range = spu_insert(
1831 *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
1832 vuc_pixel_SW_second_range, 15 );
1833 // third range
1834 vector unsigned char vuc_pixel_SW_third_range = spu_insert(
1835 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
1836 vuc_pixel_SW_third_range = spu_insert(
1837 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
1838 vuc_pixel_SW_third_range, 7 );
1839 vuc_pixel_SW_third_range = spu_insert(
1840 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
1841 vuc_pixel_SW_third_range, 11 );
1842 vuc_pixel_SW_third_range = spu_insert(
1843 *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
1844 vuc_pixel_SW_third_range, 15 );
1845 // fourth range
1846 vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
1847 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
1848 vuc_pixel_SW_fourth_range = spu_insert(
1849 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
1850 vuc_pixel_SW_fourth_range, 7 );
1851 vuc_pixel_SW_fourth_range = spu_insert(
1852 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
1853 vuc_pixel_SW_fourth_range, 11 );
1854 vuc_pixel_SW_fourth_range = spu_insert(
1855 *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
1856 vuc_pixel_SW_fourth_range, 15 );
1857
1858 // NORTH EAST
1859 // first range
1860 vector unsigned char vuc_pixel_SE_first_range = spu_insert(
1861 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
1862 vuc_pixel_SE_first_range = spu_insert(
1863 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
1864 vuc_pixel_SE_first_range, 7 );
1865 vuc_pixel_SE_first_range = spu_insert(
1866 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
1867 vuc_pixel_SE_first_range, 11 );
1868 vuc_pixel_SE_first_range = spu_insert(
1869 *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
1870 vuc_pixel_SE_first_range, 15 );
1871 // second range
1872 vector unsigned char vuc_pixel_SE_second_range = spu_insert(
1873 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
1874 vuc_pixel_SE_second_range = spu_insert(
1875 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
1876 vuc_pixel_SE_second_range, 7 );
1877 vuc_pixel_SE_second_range = spu_insert(
1878 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
1879 vuc_pixel_SE_second_range, 11 );
1880 vuc_pixel_SE_second_range = spu_insert(
1881 *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
1882 vuc_pixel_SE_second_range, 15 );
1883 // third range
1884 vector unsigned char vuc_pixel_SE_third_range = spu_insert(
1885 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
1886 vuc_pixel_SE_third_range = spu_insert(
1887 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
1888 vuc_pixel_SE_third_range, 7 );
1889 vuc_pixel_SE_third_range = spu_insert(
1890 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
1891 vuc_pixel_SE_third_range, 11 );
1892 vuc_pixel_SE_third_range = spu_insert(
1893 *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
1894 vuc_pixel_SE_third_range, 15 );
1895 // fourth range
1896 vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
1897 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
1898 vuc_pixel_SE_fourth_range = spu_insert(
1899 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
1900 vuc_pixel_SE_fourth_range, 7 );
1901 vuc_pixel_SE_fourth_range = spu_insert(
1902 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
1903 vuc_pixel_SE_fourth_range, 11 );
1904 vuc_pixel_SE_fourth_range = spu_insert(
1905 *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
1906 vuc_pixel_SE_fourth_range, 15 );
1907
1908
1909
1910 // convert to float
1911 vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
1912 vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
1913 vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
1914 vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
1915
1916 vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
1917 vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
1918 vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
1919 vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
1920
1921 vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
1922 vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
1923 vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
1924 vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
1925
1926 vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
1927 vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
1928 vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
1929 vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
1930
1931 // first linear interpolation: EWtop
1932 // EWtop = NW + EWweight*(NE-NW)
1933 //
1934 // first range
1935 vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
1936 vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
1937 vf_EWtop_first_range_tmp,
1938 vf_pixel_NW_first_range );
1939
1940 // second range
1941 vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
1942 vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
1943 vf_EWtop_second_range_tmp,
1944 vf_pixel_NW_second_range );
1945
1946 // third range
1947 vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
1948 vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
1949 vf_EWtop_third_range_tmp,
1950 vf_pixel_NW_third_range );
1951
1952 // fourth range
1953 vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
1954 vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
1955 vf_EWtop_fourth_range_tmp,
1956 vf_pixel_NW_fourth_range );
1957
1958
1959
1960 // second linear interpolation: EWbottom
1961 // EWbottom = SW + EWweight*(SE-SW)
1962 //
1963 // first range
1964 vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
1965 vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
1966 vf_EWbottom_first_range_tmp,
1967 vf_pixel_SW_first_range );
1968
1969 // second range
1970 vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
1971 vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
1972 vf_EWbottom_second_range_tmp,
1973 vf_pixel_SW_second_range );
1974 // first range
1975 vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
1976 vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
1977 vf_EWbottom_third_range_tmp,
1978 vf_pixel_SW_third_range );
1979
1980 // first range
1981 vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
1982 vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
1983 vf_EWbottom_fourth_range_tmp,
1984 vf_pixel_SW_fourth_range );
1985
1986
1987
1988 // third linear interpolation: the bilinear interpolated value
1989 // result = EWtop + NSweight*(EWbottom-EWtop);
1990 //
1991 // first range
1992 vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
1993 vector float vf_result_first_range = spu_madd( vf_NSweight,
1994 vf_result_first_range_tmp,
1995 vf_EWtop_first_range );
1996
1997 // second range
1998 vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
1999 vector float vf_result_second_range = spu_madd( vf_NSweight,
2000 vf_result_second_range_tmp,
2001 vf_EWtop_second_range );
2002
2003 // third range
2004 vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
2005 vector float vf_result_third_range = spu_madd( vf_NSweight,
2006 vf_result_third_range_tmp,
2007 vf_EWtop_third_range );
2008
2009 // fourth range
2010 vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
2011 vector float vf_result_fourth_range = spu_madd( vf_NSweight,
2012 vf_result_fourth_range_tmp,
2013 vf_EWtop_fourth_range );
2014
2015
2016
2017 // convert back: using saturated arithmetic
2018 vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
2019 vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
2020 vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
2021 vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
2022
2023 // merge results->lower,upper
2024 vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
2025 0x13, 0x17, 0x1B, 0x1F,
2026 0x00, 0x00, 0x00, 0x00,
2027 0x00, 0x00, 0x00, 0x00 };
2028
2029 vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
2030 0x00, 0x00, 0x00, 0x00,
2031 0x03, 0x07, 0x0B, 0x0F,
2032 0x13, 0x17, 0x1B, 0x1F };
2033
2034 vector unsigned char vuc_result_first_second =
2035 spu_shuffle( (vector unsigned char) vui_result_first_range,
2036 (vector unsigned char) vui_result_second_range,
2037 vuc_mask_merge_result_first_second );
2038
2039 vector unsigned char vuc_result_third_fourth =
2040 spu_shuffle( (vector unsigned char) vui_result_third_range,
2041 (vector unsigned char) vui_result_fourth_range,
2042 vuc_mask_merge_result_third_fourth );
2043
2044 // store result
2045 *((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
2046 vuc_result_third_fourth );
2047 dst += 16;
2048 }
2049}
2050