/* Retrieved from the Kolibri OS Subversion repository (rev 4349, author Serge). */
/*
2
 * Copyright (C) 2007 Marc Hoffman 
3
 *                    April 20, 2007
4
 *
5
 * Blackfin video color space converter operations
6
 * convert I420 YV12 to RGB in various formats
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24
 
25
 
26
/*
27
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29
 
30
 
31
The following calculation is used for the conversion:
32
 
33
  r = clipz((y - oy) * cy  + crv * (v - 128))
34
  g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
35
  b = clipz((y - oy) * cy  + cbu * (u - 128))
36
 
37
y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38
 
39
 
40
New factorization to eliminate the truncation error which was
41
occurring due to the byteop3p.
42
 
43
 
44
1) Use byteop16m to subtract quad bytes; we use it in U8,
   so the offsets need to be renormalized to 8 bits.
46
 
47
2) Scale operands up by a factor of 4 not 8 because Blackfin
48
   multiplies include a shift.
49
 
50
3) Compute into the accumulators cy * yx0, cy * yx1.
51
 
52
4) Compute each of the linear equations:
53
     r = clipz((y - oy) * cy  + crv * (v - 128))
54
 
55
     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
56
 
57
     b = clipz((y - oy) * cy  + cbu * (u - 128))
58
 
59
   Reuse of the accumulators requires that we actually multiply
60
   twice once with addition and the second time with a subtraction.
61
 
62
   Because of this we need to compute the equations in the order R B
63
   then G saving the writes for B in the case of 24/32 bit color
64
   formats.
65
 
66
   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67
                      int dW, uint32_t *coeffs);
68
 
69
       A          B
70
       ---        ---
71
       i2 = cb    i3 = cr
72
       i1 = coeff i0 = y
73
 
74
Where coeffs have the following layout in memory.
75
 
76
uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;
77
 
78
coeffs is a pointer to oy.
79
 
80
The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
that the data replication is used to simplify the internal algorithms for the
dual-MAC architecture of Blackfin.
83
 
84
All routines are exported with _ff_bfin_ as a symbol prefix.
85
 
86
Rough performance gain compared against -O3:
87
 
88
2779809/1484290 187.28%
89
 
90
which translates to ~33c/pel to ~57c/pel for the reference vs. 17.5
c/pel for the optimized implementations. It is not clear why there is such
a huge variation in the reference code on Blackfin; it presumably has
to do with the memory system.
94
*/
95
 
96
/*
 * Section selection.
 *   mL3 - the regular .text section.
 *   mL1 - .l1.text when building FDPIC with CONFIG_SRAM enabled,
 *         otherwise the same as mL3.
 *   MEM - section used by the yuv2rgb*_line routines.
 */
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1

/*
 * DEFUN(fname, where, interface)
 *   Opens the exported function _ff_bfin_<fname> in section <where>:
 *   emits the .section/.global/.type/.align directives followed by the
 *   label name.  The invocation supplies the trailing ':' itself.
 *   <interface> is the C prototype; it is documentation only and is not
 *   expanded.
 */
#define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname

/*
 * DEFUN_END(fname)
 *   Closes a function opened with DEFUN by recording its size.
 */
#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
113
 
114
 
115
.text

/*
 * Coefficient table geometry (bytes).
 *   COEFF_LEN        - 11 words: oy, oc, zero, cy, crv, rmask, cbu, bmask,
 *                      cgu, cgv, gmask.  Used as the circular-buffer
 *                      length for i1.
 *   COEFF_REL_CY_OFF - post-modify applied after reading gmask (index 10)
 *                      for the first pixel pair: 10 + 4 words wraps to
 *                      index 3, i.e. back to cy.
 */
#define COEFF_LEN        11*4
#define COEFF_REL_CY_OFF 4*4

/*
 * Frame-pointer offsets (after "link 0") of the yuv2rgb*_line arguments
 * that are read from the stack: out, dW and coeffs.
 */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28
123
 
124
/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV samples to 16-bit packed pixels.
 *
 * In:  r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3);
 *      out, dW and coeffs are read from fp+ARG_OUT / ARG_W / ARG_COEFF.
 *
 * Each loop iteration consumes 4 Y, 2 U and 2 V bytes and stores two
 * 32-bit words (two 16-bit pixels each).  The loop count is dW >> 2, so
 * a trailing (dW & 3) pixels are not converted.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = coeffs,
 * l1 = COEFF_LEN); l1 is cleared again before returning.
 *
 * Packing as implemented: first component  >> 3, masked with "rmask";
 *                         second component << 8, masked with "bmask";
 *                         third component  << 3, masked with "gmask".
 *
 * NOTE(review): the component the comments call R ends up in the low
 * bits and the one called B in the high bits, the reverse of the diagram
 * inside the loop; presumably the caller orders crv/cbu and the masks to
 * match -- confirm against the code that builds the table.
 *
 * NOTE(review): the samples for the next iteration are loaded at the end
 * of every iteration, including the last, so 4 bytes of Y and 2 bytes
 * each of U and V are read past the data that is converted -- confirm
 * the callers' buffers tolerate this.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y (first iteration)
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // 4 pixels per iteration

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb
        */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- pixels 0 and 1: y1,y0 with u0,v0 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the chroma term is subtracted again so the
           accumulators hold Y' for the next component */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, i1 wraps to cy
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- pixels 2 and 3: y3,y2 with u1,v1 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask, i1 wraps to oy
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y (next iteration)
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
.L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                // leave circular addressing disabled for the caller

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)
236
 
237
/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV samples to 16-bit packed pixels using
 * the 5/5/5 shifts.
 *
 * In:  r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3);
 *      out, dW and coeffs are read from fp+ARG_OUT / ARG_W / ARG_COEFF.
 *
 * Each loop iteration consumes 4 Y, 2 U and 2 V bytes and stores two
 * 32-bit words (two 16-bit pixels each).  The loop count is dW >> 2, so
 * a trailing (dW & 3) pixels are not converted.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = coeffs,
 * l1 = COEFF_LEN); l1 is cleared again before returning.
 *
 * Packing as implemented: first component  >> 3, masked with "rmask";
 *                         second component << 7, masked with "bmask";
 *                         third component  << 2, masked with "gmask".
 *
 * NOTE(review): the component the comments call R ends up in the low
 * bits and the one called B in the high bits, the reverse of the diagram
 * inside the loop; presumably the caller orders crv/cbu and the masks to
 * match -- confirm against the code that builds the table.
 *
 * NOTE(review): the samples for the next iteration are loaded at the end
 * of every iteration, including the last, so 4 bytes of Y and 2 bytes
 * each of U and V are read past the data that is converted -- confirm
 * the callers' buffers tolerate this.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y (first iteration)
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // 4 pixels per iteration

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb
        */

        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- pixels 0 and 1: y1,y0 with u0,v0 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the chroma term is subtracted again so the
           accumulators hold Y' for the next component */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, i1 wraps to cy
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* ---- pixels 2 and 3: y3,y2 with u1,v1 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask, i1 wraps to oy
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y (next iteration)
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

.L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                // leave circular addressing disabled for the caller

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)
351
 
352
/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV samples to 3 bytes per pixel.
 *
 * In:  r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3);
 *      out, dW and coeffs are read from fp+ARG_OUT / ARG_W / ARG_COEFF.
 *
 * Each loop iteration consumes 4 Y, 2 U and 2 V bytes and stores 12
 * output bytes.  The loop count is dW >> 2, so a trailing (dW & 3)
 * pixels are not converted.
 *
 * Two pixels are computed at a time.  p1 addresses the first pixel of
 * the pair and p2 = p1 + 3 the second; every component is stored with one
 * byte store through each pointer, and both pointers then skip 3 bytes to
 * reach the next pair.  Per pixel the bytes are written in the order
 * R, G, B as named by the comments (B is kept in r3 until G is stored).
 *
 * i1 walks the coefficient table as a circular buffer (b1 = coeffs,
 * l1 = COEFF_LEN); l1 is cleared again before returning.  The rmask,
 * bmask and gmask words are still loaded into r5 to keep i1 in step, but
 * their values are not used by this routine.
 *
 * NOTE(review): the samples for the next iteration are loaded at the end
 * of every iteration, including the last, so 4 bytes of Y and 2 bytes
 * each of U and V are read past the data that is converted -- confirm
 * the callers' buffers tolerate this.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;               // second pixel of each pair

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4Y (first iteration)
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // 4 pixels per iteration

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* ---- pixels 0 and 1: y1,y0 with u0,v0 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the chroma term is subtracted again so the
           accumulators hold Y' for the next component */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;        // store R of pixel 0, expose pixel 1
                     B[p2++]=r2;        // store R of pixel 1

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, i1 wraps to cy

        r2=r2>>16 || B[p1++]=r2;        // G
                     B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;        // B
                     B[p2++]=r3                            || r1=[i1++]; // cy

        p1+=3;                          // step over the pixel written via p2
        p2+=3;

        /* ---- pixels 2 and 3: y3,y2 with u1,v1 ---- */

        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask, i1 wraps to oy
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4Y (next iteration)
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;                // leave circular addressing disabled for the caller

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb24_line)
458
 
459
 
460
 
461
/*
 * Frame-pointer offsets (after "link 0") of the uyvytoyv12 / yuyvtoyv12
 * arguments that are read from the stack, in prototype order starting
 * with the fourth parameter.
 */
#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40
467
 
468
/*
 * _ff_bfin_uyvytoyv12
 *
 * Splits packed UYVY into separate Y, U and V planes, two source lines
 * per outer iteration.  Luma of both lines is copied; chroma is taken
 * from the byte-wise average (byteop1p) of the two lines, giving one
 * chroma line per line pair.
 *
 * In:  r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *      chromStride and srcStride are read from the stack frame.
 *
 * The outer loop runs height >> 1 times and the inner loop width >> 2
 * times; each inner iteration consumes 8 source bytes per line and
 * writes 4 Y bytes per line plus 2 U and 2 V bytes.  The pointer
 * corrections below assume width is a multiple of 4.
 *
 * Fix: p0/p1 have already advanced by width when a line pair is done, so
 * the luma correction must be 2*lumStride - width.  Adding lumStride
 * alone only reached the next line pair when lumStride == width.
 *
 * NOTE(review): the inner loop loads the source words for the following
 * iteration, so 8 bytes past the consumed part of each source line are
 * read (m0 compensates the pointers with the -8) -- confirm the source
 * buffer tolerates this on the last line pair.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         int width, int height,
                         int lumStride, int chromStride, int srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r3 = [fp + ARG_lumStride];
        r3 = r3 << 1;
        r3 = r3 - r0;  // 2*lumStride - width
        p2 = r3;       // luma advance to the next line pair

        r2 = r0 << 1;
        r1 = r1 << 1;
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;  // i0,i1 is pre read need to correct
        m0 = r1;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = byteop1p(r1:0, r3:2);
          r5 = byteop1p(r1:0, r3:2) (r);
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v);        // bring the Y bytes down to the low byte
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)
540
 
541
/*
 * _ff_bfin_yuyvtoyv12
 *
 * Splits packed YUYV into separate Y, U and V planes, two source lines
 * per outer iteration.  Luma of both lines is copied; chroma is taken
 * from the byte-wise average (byteop1p) of the two lines, giving one
 * chroma line per line pair.
 *
 * In:  r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *      chromStride and srcStride are read from the stack frame.
 *
 * The outer loop runs height >> 1 times and the inner loop width >> 2
 * times; each inner iteration consumes 8 source bytes per line and
 * writes 4 Y bytes per line plus 2 U and 2 V bytes.  The pointer
 * corrections below assume width is a multiple of 4.
 *
 * Fix: p0/p1 have already advanced by width when a line pair is done, so
 * the luma correction must be 2*lumStride - width.  Adding lumStride
 * alone only reached the next line pair when lumStride == width.
 *
 * NOTE(review): the inner loop loads the source words for the following
 * iteration, so 8 bytes past the consumed part of each source line are
 * read (m0 compensates the pointers with the -8) -- confirm the source
 * buffer tolerates this on the last line pair.
 */
DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         int width, int height,
                         int lumStride, int chromStride, int srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;

        i0 = r0;  // yuyv top (even) line
        i1 = r2;  // yuyv bottom (odd) line

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r3 = [fp + ARG_lumStride];
        r3 = r3 << 1;
        r3 = r3 - r0;  // 2*lumStride - width
        p2 = r3;       // luma advance to the next line pair

        r2 = r0 << 1;
        r1 = r1 << 1;
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;  // i0,i1 is pre read need to correct
        m0 = r1;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = bytepack(r0, r1);
          r5 = bytepack(r2, r3);
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);                 // chroma bytes now in the low byte
            r3 = r3 >> 8(v);
            r4 = byteop1p(r1:0, r3:2);
            r5 = byteop1p(r1:0, r3:2) (r);
            r6 = pack(r5.l, r4.l);
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuyvtoyv12)