1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  *
20  * The C code (not assembly, MMX, ...) of this file can be used
21  * under the LGPL license.
22  */
23
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29
30 #if COMPILE_TEMPLATE_AMD3DNOW
31 #define PREFETCH  "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif COMPILE_TEMPLATE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
36 #else
37 #define PREFETCH  " # nop"
38 #define PREFETCHW " # nop"
39 #endif
40
41 #if COMPILE_TEMPLATE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif COMPILE_TEMPLATE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
45 #endif
46
47 #if COMPILE_TEMPLATE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
49 #else
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
51 #endif
52 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
53
54 #if COMPILE_TEMPLATE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
56 #endif
57
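/* YSCALEYUV2YV12X: MMX inner loop for the multi-tap vertical filter on one
 * plane.  In outline: starting from the rounder constant, it walks the packed
 * {source pointer, coefficient} list at 'offset' inside the context, multiplies
 * each 16-bit source sample by its coefficient (pmulhw), accumulates, shifts
 * the sum right by 3 and packs to unsigned bytes, storing 8 output pixels per
 * iteration with MOVNTQ. */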
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
59     __asm__ volatile(\
60         "xor                          %%"REG_a", %%"REG_a"  \n\t"\
61         "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
62         "movq                             %%mm3, %%mm4      \n\t"\
63         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
64         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
65         ASMALIGN(4) /* FIXME Unroll? */\
66         "1:                                                 \n\t"\
67         "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
68         "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
69         "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
70         "add                                $16, %%"REG_d"  \n\t"\
71         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
72         "test                         %%"REG_S", %%"REG_S"  \n\t"\
73         "pmulhw                           %%mm0, %%mm2      \n\t"\
74         "pmulhw                           %%mm0, %%mm5      \n\t"\
75         "paddw                            %%mm2, %%mm3      \n\t"\
76         "paddw                            %%mm5, %%mm4      \n\t"\
77         " jnz                                1b             \n\t"\
78         "psraw                               $3, %%mm3      \n\t"\
79         "psraw                               $3, %%mm4      \n\t"\
80         "packuswb                         %%mm4, %%mm3      \n\t"\
81         MOVNTQ(%%mm3, (%1, %%REGa))\
82         "add                                 $8, %%"REG_a"  \n\t"\
83         "cmp                                 %2, %%"REG_a"  \n\t"\
84         "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
85         "movq                             %%mm3, %%mm4      \n\t"\
86         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
87         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
88         "jb                                  1b             \n\t"\
89         :: "r" (&c->redDither),\
90         "r" (dest), "g" (width)\
91         : "%"REG_a, "%"REG_d, "%"REG_S\
92     );
93
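/* YSCALEYUV2YV12X_ACCURATE: SWS_ACCURATE_RND variant of the loop above.  Data
 * from two consecutive taps is interleaved with punpcklwd/punpckhwd and fed to
 * pmaddwd, so the products are summed in 32-bit precision before packssdw,
 * the rounder add and the final >>3.  APCK_PTR2/APCK_COEF/APCK_SIZE describe
 * the layout of one packed two-tap filter entry. */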
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
95     __asm__ volatile(\
96         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
97         "xor                          %%"REG_a", %%"REG_a"  \n\t"\
98         "pxor                             %%mm4, %%mm4      \n\t"\
99         "pxor                             %%mm5, %%mm5      \n\t"\
100         "pxor                             %%mm6, %%mm6      \n\t"\
101         "pxor                             %%mm7, %%mm7      \n\t"\
102         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
103         ASMALIGN(4) \
104         "1:                                                 \n\t"\
105         "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
106         "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
107         "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
108         "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
109         "movq                             %%mm0, %%mm3      \n\t"\
110         "punpcklwd                        %%mm1, %%mm0      \n\t"\
111         "punpckhwd                        %%mm1, %%mm3      \n\t"\
112         "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
113         "pmaddwd                          %%mm1, %%mm0      \n\t"\
114         "pmaddwd                          %%mm1, %%mm3      \n\t"\
115         "paddd                            %%mm0, %%mm4      \n\t"\
116         "paddd                            %%mm3, %%mm5      \n\t"\
117         "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
118         "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
119         "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
120         "test                         %%"REG_S", %%"REG_S"  \n\t"\
121         "movq                             %%mm2, %%mm0      \n\t"\
122         "punpcklwd                        %%mm3, %%mm2      \n\t"\
123         "punpckhwd                        %%mm3, %%mm0      \n\t"\
124         "pmaddwd                          %%mm1, %%mm2      \n\t"\
125         "pmaddwd                          %%mm1, %%mm0      \n\t"\
126         "paddd                            %%mm2, %%mm6      \n\t"\
127         "paddd                            %%mm0, %%mm7      \n\t"\
128         " jnz                                1b             \n\t"\
129         "psrad                              $16, %%mm4      \n\t"\
130         "psrad                              $16, %%mm5      \n\t"\
131         "psrad                              $16, %%mm6      \n\t"\
132         "psrad                              $16, %%mm7      \n\t"\
133         "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
134         "packssdw                         %%mm5, %%mm4      \n\t"\
135         "packssdw                         %%mm7, %%mm6      \n\t"\
136         "paddw                            %%mm0, %%mm4      \n\t"\
137         "paddw                            %%mm0, %%mm6      \n\t"\
138         "psraw                               $3, %%mm4      \n\t"\
139         "psraw                               $3, %%mm6      \n\t"\
140         "packuswb                         %%mm6, %%mm4      \n\t"\
141         MOVNTQ(%%mm4, (%1, %%REGa))\
142         "add                                 $8, %%"REG_a"  \n\t"\
143         "cmp                                 %2, %%"REG_a"  \n\t"\
144         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
145         "pxor                             %%mm4, %%mm4      \n\t"\
146         "pxor                             %%mm5, %%mm5      \n\t"\
147         "pxor                             %%mm6, %%mm6      \n\t"\
148         "pxor                             %%mm7, %%mm7      \n\t"\
149         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
150         "jb                                  1b             \n\t"\
151         :: "r" (&c->redDither),\
152         "r" (dest), "g" (width)\
153         : "%"REG_a, "%"REG_d, "%"REG_S\
154     );
155
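/* YSCALEYUV2YV121: unscaled vertical path (a single source line per plane);
 * the 16-bit intermediate is simply shifted right by 7 and packed to bytes.
 * The _ACCURATE version first adds a rounding constant of 64, built in mm7. */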
156 #define YSCALEYUV2YV121 \
157     "mov %2, %%"REG_a"                    \n\t"\
158     ASMALIGN(4) /* FIXME Unroll? */\
159     "1:                                   \n\t"\
160     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
161     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
162     "psraw                 $7, %%mm0      \n\t"\
163     "psraw                 $7, %%mm1      \n\t"\
164     "packuswb           %%mm1, %%mm0      \n\t"\
165     MOVNTQ(%%mm0, (%1, %%REGa))\
166     "add                   $8, %%"REG_a"  \n\t"\
167     "jnc                   1b             \n\t"
168
169 #define YSCALEYUV2YV121_ACCURATE \
170     "mov %2, %%"REG_a"                    \n\t"\
171     "pcmpeqw %%mm7, %%mm7                 \n\t"\
172     "psrlw                 $15, %%mm7     \n\t"\
173     "psllw                  $6, %%mm7     \n\t"\
174     ASMALIGN(4) /* FIXME Unroll? */\
175     "1:                                   \n\t"\
176     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
177     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
178     "paddsw             %%mm7, %%mm0      \n\t"\
179     "paddsw             %%mm7, %%mm1      \n\t"\
180     "psraw                 $7, %%mm0      \n\t"\
181     "psraw                 $7, %%mm1      \n\t"\
182     "packuswb           %%mm1, %%mm0      \n\t"\
183     MOVNTQ(%%mm0, (%1, %%REGa))\
184     "add                   $8, %%"REG_a"  \n\t"\
185     "jnc                   1b             \n\t"
186
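/* The YSCALEYUV2PACKEDX* macros are the vertical-scaling front end for packed
 * output: _UV accumulates the filtered U/V samples into mm3/mm4, the _YA part
 * does the same for a pair of Y (or alpha) registers, and the result is then
 * converted and written out by YSCALEYUV2RGBX plus one of the WRITE* macros. */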
187 /*
188     :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189        "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190        "r" (dest), "m" (dstW),
191        "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
193 */
194 #define YSCALEYUV2PACKEDX_UV \
195     __asm__ volatile(\
196         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
197         ASMALIGN(4)\
198         "nop                                            \n\t"\
199         "1:                                             \n\t"\
200         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
201         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
202         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
203         "movq                      %%mm3, %%mm4         \n\t"\
204         ASMALIGN(4)\
205         "2:                                             \n\t"\
206         "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
207         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
208         "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
209         "add                         $16, %%"REG_d"     \n\t"\
210         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
211         "pmulhw                    %%mm0, %%mm2         \n\t"\
212         "pmulhw                    %%mm0, %%mm5         \n\t"\
213         "paddw                     %%mm2, %%mm3         \n\t"\
214         "paddw                     %%mm5, %%mm4         \n\t"\
215         "test                  %%"REG_S", %%"REG_S"     \n\t"\
216         " jnz                         2b                \n\t"\
217
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219     "lea                "offset"(%0), %%"REG_d"     \n\t"\
220     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
221     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
222     "movq                    "#dst1", "#dst2"       \n\t"\
223     ASMALIGN(4)\
224     "2:                                             \n\t"\
225     "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
226     "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
227     "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
228     "add                         $16, %%"REG_d"            \n\t"\
229     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
230     "pmulhw                 "#coeff", "#src1"       \n\t"\
231     "pmulhw                 "#coeff", "#src2"       \n\t"\
232     "paddw                   "#src1", "#dst1"       \n\t"\
233     "paddw                   "#src2", "#dst2"       \n\t"\
234     "test                  %%"REG_S", %%"REG_S"     \n\t"\
235     " jnz                         2b                \n\t"\
236
237 #define YSCALEYUV2PACKEDX \
238     YSCALEYUV2PACKEDX_UV \
239     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
240
241 #define YSCALEYUV2PACKEDX_END                     \
242         :: "r" (&c->redDither),                   \
243             "m" (dummy), "m" (dummy), "m" (dummy),\
244             "r" (dest), "m" (dstW)                \
245         : "%"REG_a, "%"REG_d, "%"REG_S            \
246     );
247
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
249     __asm__ volatile(\
250         "xor %%"REG_a", %%"REG_a"                       \n\t"\
251         ASMALIGN(4)\
252         "nop                                            \n\t"\
253         "1:                                             \n\t"\
254         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
255         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
256         "pxor                      %%mm4, %%mm4         \n\t"\
257         "pxor                      %%mm5, %%mm5         \n\t"\
258         "pxor                      %%mm6, %%mm6         \n\t"\
259         "pxor                      %%mm7, %%mm7         \n\t"\
260         ASMALIGN(4)\
261         "2:                                             \n\t"\
262         "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
263         "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
264         "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
265         "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
266         "movq                      %%mm0, %%mm3         \n\t"\
267         "punpcklwd                 %%mm1, %%mm0         \n\t"\
268         "punpckhwd                 %%mm1, %%mm3         \n\t"\
269         "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
270         "pmaddwd                   %%mm1, %%mm0         \n\t"\
271         "pmaddwd                   %%mm1, %%mm3         \n\t"\
272         "paddd                     %%mm0, %%mm4         \n\t"\
273         "paddd                     %%mm3, %%mm5         \n\t"\
274         "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
275         "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
276         "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
277         "test                  %%"REG_S", %%"REG_S"     \n\t"\
278         "movq                      %%mm2, %%mm0         \n\t"\
279         "punpcklwd                 %%mm3, %%mm2         \n\t"\
280         "punpckhwd                 %%mm3, %%mm0         \n\t"\
281         "pmaddwd                   %%mm1, %%mm2         \n\t"\
282         "pmaddwd                   %%mm1, %%mm0         \n\t"\
283         "paddd                     %%mm2, %%mm6         \n\t"\
284         "paddd                     %%mm0, %%mm7         \n\t"\
285         " jnz                         2b                \n\t"\
286         "psrad                       $16, %%mm4         \n\t"\
287         "psrad                       $16, %%mm5         \n\t"\
288         "psrad                       $16, %%mm6         \n\t"\
289         "psrad                       $16, %%mm7         \n\t"\
290         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
291         "packssdw                  %%mm5, %%mm4         \n\t"\
292         "packssdw                  %%mm7, %%mm6         \n\t"\
293         "paddw                     %%mm0, %%mm4         \n\t"\
294         "paddw                     %%mm0, %%mm6         \n\t"\
295         "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
296         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
297
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299     "lea                "offset"(%0), %%"REG_d"     \n\t"\
300     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
301     "pxor                      %%mm1, %%mm1         \n\t"\
302     "pxor                      %%mm5, %%mm5         \n\t"\
303     "pxor                      %%mm7, %%mm7         \n\t"\
304     "pxor                      %%mm6, %%mm6         \n\t"\
305     ASMALIGN(4)\
306     "2:                                             \n\t"\
307     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
308     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
309     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
310     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
311     "movq                      %%mm0, %%mm3         \n\t"\
312     "punpcklwd                 %%mm4, %%mm0         \n\t"\
313     "punpckhwd                 %%mm4, %%mm3         \n\t"\
314     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
315     "pmaddwd                   %%mm4, %%mm0         \n\t"\
316     "pmaddwd                   %%mm4, %%mm3         \n\t"\
317     "paddd                     %%mm0, %%mm1         \n\t"\
318     "paddd                     %%mm3, %%mm5         \n\t"\
319     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
320     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
321     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
322     "test                  %%"REG_S", %%"REG_S"     \n\t"\
323     "movq                      %%mm2, %%mm0         \n\t"\
324     "punpcklwd                 %%mm3, %%mm2         \n\t"\
325     "punpckhwd                 %%mm3, %%mm0         \n\t"\
326     "pmaddwd                   %%mm4, %%mm2         \n\t"\
327     "pmaddwd                   %%mm4, %%mm0         \n\t"\
328     "paddd                     %%mm2, %%mm7         \n\t"\
329     "paddd                     %%mm0, %%mm6         \n\t"\
330     " jnz                         2b                \n\t"\
331     "psrad                       $16, %%mm1         \n\t"\
332     "psrad                       $16, %%mm5         \n\t"\
333     "psrad                       $16, %%mm7         \n\t"\
334     "psrad                       $16, %%mm6         \n\t"\
335     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
336     "packssdw                  %%mm5, %%mm1         \n\t"\
337     "packssdw                  %%mm6, %%mm7         \n\t"\
338     "paddw                     %%mm0, %%mm1         \n\t"\
339     "paddw                     %%mm0, %%mm7         \n\t"\
340     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
341     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
342
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344     YSCALEYUV2PACKEDX_ACCURATE_UV \
345     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346
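/* YSCALEYUV2RGBX: the actual YUV->RGB step.  It subtracts the Y/U/V offsets,
 * multiplies by the coefficients stored in the SwsContext (pmulhw), forms the
 * green term as ug+vg, adds the two luma registers, and packs the result so
 * that mm2/mm4/mm5 hold the B, G and R bytes for 8 pixels. */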
347 #define YSCALEYUV2RGBX \
348     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
349     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
350     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
351     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
352     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
353     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
354     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
356     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
357     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
358     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
359     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
360     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
361     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362     "paddw           %%mm3, %%mm4       \n\t"\
363     "movq            %%mm2, %%mm0       \n\t"\
364     "movq            %%mm5, %%mm6       \n\t"\
365     "movq            %%mm4, %%mm3       \n\t"\
366     "punpcklwd       %%mm2, %%mm2       \n\t"\
367     "punpcklwd       %%mm5, %%mm5       \n\t"\
368     "punpcklwd       %%mm4, %%mm4       \n\t"\
369     "paddw           %%mm1, %%mm2       \n\t"\
370     "paddw           %%mm1, %%mm5       \n\t"\
371     "paddw           %%mm1, %%mm4       \n\t"\
372     "punpckhwd       %%mm0, %%mm0       \n\t"\
373     "punpckhwd       %%mm6, %%mm6       \n\t"\
374     "punpckhwd       %%mm3, %%mm3       \n\t"\
375     "paddw           %%mm7, %%mm0       \n\t"\
376     "paddw           %%mm7, %%mm6       \n\t"\
377     "paddw           %%mm7, %%mm3       \n\t"\
378     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379     "packuswb        %%mm0, %%mm2       \n\t"\
380     "packuswb        %%mm6, %%mm5       \n\t"\
381     "packuswb        %%mm3, %%mm4       \n\t"\
382
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
385     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
386     "psraw                $3, %%mm0                           \n\t"\
387     "psraw                $3, %%mm1                           \n\t"\
388     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390     "xor            "#index", "#index"                        \n\t"\
391     ASMALIGN(4)\
392     "1:                                 \n\t"\
393     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
394     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
395     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
396     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
397     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
400     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402     "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
403     "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
404     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
405     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
406     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
407     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
408     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
409     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
410     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
411     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
412     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414     "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
415     "psraw                $7, %%mm7     \n\t" /* buf1[eax+4] >>7*/\
416     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
418
419 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
420
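/* REAL_YSCALEYUV2RGB_UV/_YA/_COEFF: bilinear variant working on two input
 * lines (buf0/buf1, uvbuf0/uvbuf1).  Chroma and luma are blended with the
 * interpolation weights stored at CHR/LUM_MMX_FILTER_OFFSET+8 in the context
 * and then pushed through the same offset/coefficient math as YSCALEYUV2RGBX. */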
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422     "xor            "#index", "#index"  \n\t"\
423     ASMALIGN(4)\
424     "1:                                 \n\t"\
425     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
428     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
429     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
432     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434     "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
435     "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
436     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
437     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
438     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
439     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
440     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
441     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
442     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
443     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
444     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
445
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
448     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
449     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
450     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
451     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
452     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
453     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
456     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
462     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
463     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
464     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
465     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
466     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
467     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468     "paddw             %%mm3, %%mm4     \n\t"\
469     "movq              %%mm2, %%mm0     \n\t"\
470     "movq              %%mm5, %%mm6     \n\t"\
471     "movq              %%mm4, %%mm3     \n\t"\
472     "punpcklwd         %%mm2, %%mm2     \n\t"\
473     "punpcklwd         %%mm5, %%mm5     \n\t"\
474     "punpcklwd         %%mm4, %%mm4     \n\t"\
475     "paddw             %%mm1, %%mm2     \n\t"\
476     "paddw             %%mm1, %%mm5     \n\t"\
477     "paddw             %%mm1, %%mm4     \n\t"\
478     "punpckhwd         %%mm0, %%mm0     \n\t"\
479     "punpckhwd         %%mm6, %%mm6     \n\t"\
480     "punpckhwd         %%mm3, %%mm3     \n\t"\
481     "paddw             %%mm7, %%mm0     \n\t"\
482     "paddw             %%mm7, %%mm6     \n\t"\
483     "paddw             %%mm7, %%mm3     \n\t"\
484     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485     "packuswb          %%mm0, %%mm2     \n\t"\
486     "packuswb          %%mm6, %%mm5     \n\t"\
487     "packuswb          %%mm3, %%mm4     \n\t"\
488
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
490
491 #define YSCALEYUV2RGB(index, c) \
492     REAL_YSCALEYUV2RGB_UV(index, c) \
493     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494     REAL_YSCALEYUV2RGB_COEFF(c)
495
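/* The *1 and *1b variants below handle the case where only one luma line is
 * needed: *1 reads uvbuf0 directly, while *1b averages uvbuf0 and uvbuf1
 * (vertical chrominance interpolation, see the comment further down). */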
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497     "xor            "#index", "#index"  \n\t"\
498     ASMALIGN(4)\
499     "1:                                 \n\t"\
500     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
501     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
502     "psraw                $7, %%mm3     \n\t" \
503     "psraw                $7, %%mm4     \n\t" \
504     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
505     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
506     "psraw                $7, %%mm1     \n\t" \
507     "psraw                $7, %%mm7     \n\t" \
508
509 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
510
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512     "xor            "#index", "#index"  \n\t"\
513     ASMALIGN(4)\
514     "1:                                 \n\t"\
515     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
516     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
517     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
518     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
519     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
520     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
521     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
522     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
523     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
524     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
525     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
527     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
528     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
529     "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
530     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
531     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
532     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
533     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
534     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
535     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
536     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537     "paddw             %%mm3, %%mm4     \n\t"\
538     "movq              %%mm2, %%mm0     \n\t"\
539     "movq              %%mm5, %%mm6     \n\t"\
540     "movq              %%mm4, %%mm3     \n\t"\
541     "punpcklwd         %%mm2, %%mm2     \n\t"\
542     "punpcklwd         %%mm5, %%mm5     \n\t"\
543     "punpcklwd         %%mm4, %%mm4     \n\t"\
544     "paddw             %%mm1, %%mm2     \n\t"\
545     "paddw             %%mm1, %%mm5     \n\t"\
546     "paddw             %%mm1, %%mm4     \n\t"\
547     "punpckhwd         %%mm0, %%mm0     \n\t"\
548     "punpckhwd         %%mm6, %%mm6     \n\t"\
549     "punpckhwd         %%mm3, %%mm3     \n\t"\
550     "paddw             %%mm7, %%mm0     \n\t"\
551     "paddw             %%mm7, %%mm6     \n\t"\
552     "paddw             %%mm7, %%mm3     \n\t"\
553     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554     "packuswb          %%mm0, %%mm2     \n\t"\
555     "packuswb          %%mm6, %%mm5     \n\t"\
556     "packuswb          %%mm3, %%mm4     \n\t"\
557
558 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
559
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561     "xor "#index", "#index"             \n\t"\
562     ASMALIGN(4)\
563     "1:                                 \n\t"\
564     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
565     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
566     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
567     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
568     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570     "psrlw                $8, %%mm3     \n\t" \
571     "psrlw                $8, %%mm4     \n\t" \
572     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
573     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
574     "psraw                $7, %%mm1     \n\t" \
575     "psraw                $7, %%mm7     \n\t"
576 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
577
578 // do vertical chrominance interpolation
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580     "xor            "#index", "#index"  \n\t"\
581     ASMALIGN(4)\
582     "1:                                 \n\t"\
583     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
584     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
585     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
586     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
587     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
590     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
591     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
592     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
593     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
594     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
595     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
596     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
597     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
599     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
600     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
601     "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
602     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
603     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
604     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
605     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
606     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
607     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
608     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609     "paddw             %%mm3, %%mm4     \n\t"\
610     "movq              %%mm2, %%mm0     \n\t"\
611     "movq              %%mm5, %%mm6     \n\t"\
612     "movq              %%mm4, %%mm3     \n\t"\
613     "punpcklwd         %%mm2, %%mm2     \n\t"\
614     "punpcklwd         %%mm5, %%mm5     \n\t"\
615     "punpcklwd         %%mm4, %%mm4     \n\t"\
616     "paddw             %%mm1, %%mm2     \n\t"\
617     "paddw             %%mm1, %%mm5     \n\t"\
618     "paddw             %%mm1, %%mm4     \n\t"\
619     "punpckhwd         %%mm0, %%mm0     \n\t"\
620     "punpckhwd         %%mm6, %%mm6     \n\t"\
621     "punpckhwd         %%mm3, %%mm3     \n\t"\
622     "paddw             %%mm7, %%mm0     \n\t"\
623     "paddw             %%mm7, %%mm6     \n\t"\
624     "paddw             %%mm7, %%mm3     \n\t"\
625     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626     "packuswb          %%mm0, %%mm2     \n\t"\
627     "packuswb          %%mm6, %%mm5     \n\t"\
628     "packuswb          %%mm3, %%mm4     \n\t"\
629
630 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
631
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
634     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
635     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
636     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
637     "packuswb          %%mm1, %%mm7     \n\t"
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639
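/* WRITE* store macros: they take the packed B/G/R(/A) byte vectors produced
 * above and interleave them into the destination layout (4-byte RGB with
 * alpha, 24-bit BGR, RGB565, RGB555 or YUY2), writing 8 pixels per iteration
 * with MOVNTQ and looping back to label 1 until 'index' reaches 'dstw'. */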
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641     "movq       "#b", "#q2"     \n\t" /* B */\
642     "movq       "#r", "#t"      \n\t" /* R */\
643     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
644     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
645     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
646     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
647     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
648     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
649     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
650     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
651     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
652     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
653 \
654     MOVNTQ(   q0,   (dst, index, 4))\
655     MOVNTQ(    b,  8(dst, index, 4))\
656     MOVNTQ(   q2, 16(dst, index, 4))\
657     MOVNTQ(   q3, 24(dst, index, 4))\
658 \
659     "add      $8, "#index"      \n\t"\
660     "cmp "#dstw", "#index"      \n\t"\
661     " jb      1b                \n\t"
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
663
664 #define REAL_WRITERGB16(dst, dstw, index) \
665     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
666     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
667     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
668     "psrlq           $3, %%mm2  \n\t"\
669 \
670     "movq         %%mm2, %%mm1  \n\t"\
671     "movq         %%mm4, %%mm3  \n\t"\
672 \
673     "punpcklbw    %%mm7, %%mm3  \n\t"\
674     "punpcklbw    %%mm5, %%mm2  \n\t"\
675     "punpckhbw    %%mm7, %%mm4  \n\t"\
676     "punpckhbw    %%mm5, %%mm1  \n\t"\
677 \
678     "psllq           $3, %%mm3  \n\t"\
679     "psllq           $3, %%mm4  \n\t"\
680 \
681     "por          %%mm3, %%mm2  \n\t"\
682     "por          %%mm4, %%mm1  \n\t"\
683 \
684     MOVNTQ(%%mm2,  (dst, index, 2))\
685     MOVNTQ(%%mm1, 8(dst, index, 2))\
686 \
687     "add             $8, "#index"   \n\t"\
688     "cmp        "#dstw", "#index"   \n\t"\
689     " jb             1b             \n\t"
690 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
691
692 #define REAL_WRITERGB15(dst, dstw, index) \
693     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
694     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
695     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
696     "psrlq           $3, %%mm2  \n\t"\
697     "psrlq           $1, %%mm5  \n\t"\
698 \
699     "movq         %%mm2, %%mm1  \n\t"\
700     "movq         %%mm4, %%mm3  \n\t"\
701 \
702     "punpcklbw    %%mm7, %%mm3  \n\t"\
703     "punpcklbw    %%mm5, %%mm2  \n\t"\
704     "punpckhbw    %%mm7, %%mm4  \n\t"\
705     "punpckhbw    %%mm5, %%mm1  \n\t"\
706 \
707     "psllq           $2, %%mm3  \n\t"\
708     "psllq           $2, %%mm4  \n\t"\
709 \
710     "por          %%mm3, %%mm2  \n\t"\
711     "por          %%mm4, %%mm1  \n\t"\
712 \
713     MOVNTQ(%%mm2,  (dst, index, 2))\
714     MOVNTQ(%%mm1, 8(dst, index, 2))\
715 \
716     "add             $8, "#index"   \n\t"\
717     "cmp        "#dstw", "#index"   \n\t"\
718     " jb             1b             \n\t"
719 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
720
721 #define WRITEBGR24OLD(dst, dstw, index) \
722     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723     "movq      %%mm2, %%mm1             \n\t" /* B */\
724     "movq      %%mm5, %%mm6             \n\t" /* R */\
725     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
726     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
727     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
728     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
729     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
730     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
731     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
732     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
733     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
734     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
735 \
736     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
737     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
738     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
739     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
740     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
741     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
742     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
743     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
744 \
745     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
746     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
747     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
748     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
749     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
750     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
751     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
752     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
753     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
754     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
755     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
756     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
757     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
758 \
759     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
760     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
761     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
762     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
763     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
764     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
765     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
766     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
767 \
768     MOVNTQ(%%mm0,   (dst))\
769     MOVNTQ(%%mm2,  8(dst))\
770     MOVNTQ(%%mm3, 16(dst))\
771     "add         $24, "#dst"            \n\t"\
772 \
773     "add          $8, "#index"          \n\t"\
774     "cmp     "#dstw", "#index"          \n\t"\
775     " jb          1b                    \n\t"
776
777 #define WRITEBGR24MMX(dst, dstw, index) \
778     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779     "movq      %%mm2, %%mm1     \n\t" /* B */\
780     "movq      %%mm5, %%mm6     \n\t" /* R */\
781     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
782     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
783     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
784     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
785     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
786     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
787     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
788     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
789     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
790     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
791 \
792     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
793     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
794     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
795     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
796 \
797     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
798     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
799     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
800     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
801 \
802     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
803     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
804     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
805     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
806 \
807     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
808     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
809     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
810     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
811     MOVNTQ(%%mm0, (dst))\
812 \
813     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
814     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
815     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
816     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
817     MOVNTQ(%%mm6, 8(dst))\
818 \
819     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
820     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
821     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
822     MOVNTQ(%%mm5, 16(dst))\
823 \
824     "add         $24, "#dst"    \n\t"\
825 \
826     "add          $8, "#index"  \n\t"\
827     "cmp     "#dstw", "#index"  \n\t"\
828     " jb          1b            \n\t"
829
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
832     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
835     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
836     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
837 \
838     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
839     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
840     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
841 \
842     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
843     "por    %%mm1, %%mm6        \n\t"\
844     "por    %%mm3, %%mm6        \n\t"\
845     MOVNTQ(%%mm6, (dst))\
846 \
847     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
848     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
849     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
850     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
851 \
852     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
853     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
854     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
855 \
856     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
857     "por    %%mm3, %%mm6        \n\t"\
858     MOVNTQ(%%mm6, 8(dst))\
859 \
860     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
861     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
862     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
863 \
864     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
865     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
866     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
867 \
868     "por    %%mm1, %%mm3        \n\t"\
869     "por    %%mm3, %%mm6        \n\t"\
870     MOVNTQ(%%mm6, 16(dst))\
871 \
872     "add      $24, "#dst"       \n\t"\
873 \
874     "add       $8, "#index"     \n\t"\
875     "cmp  "#dstw", "#index"     \n\t"\
876     " jb       1b               \n\t"
877
878 #if COMPILE_TEMPLATE_MMX2
879 #undef WRITEBGR24
880 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
881 #else
882 #undef WRITEBGR24
883 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
884 #endif
885
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887     "packuswb  %%mm3, %%mm3     \n\t"\
888     "packuswb  %%mm4, %%mm4     \n\t"\
889     "packuswb  %%mm7, %%mm1     \n\t"\
890     "punpcklbw %%mm4, %%mm3     \n\t"\
891     "movq      %%mm1, %%mm7     \n\t"\
892     "punpcklbw %%mm3, %%mm1     \n\t"\
893     "punpckhbw %%mm3, %%mm7     \n\t"\
894 \
895     MOVNTQ(%%mm1, (dst, index, 2))\
896     MOVNTQ(%%mm7, 8(dst, index, 2))\
897 \
898     "add          $8, "#index"  \n\t"\
899     "cmp     "#dstw", "#index"  \n\t"\
900     " jb          1b            \n\t"
901 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
902
903
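/* yuv2yuvX: vertical scaling to planar YUV (plus optional alpha).  Unless
 * SWS_BITEXACT is set, the MMX loops above are used, picking the _ACCURATE
 * form when SWS_ACCURATE_RND is requested; otherwise the work is handed to
 * the AltiVec or plain C implementation. */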
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
907 {
908 #if COMPILE_TEMPLATE_MMX
909     if(!(c->flags & SWS_BITEXACT)) {
910         if (c->flags & SWS_ACCURATE_RND) {
911             if (uDest) {
912                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913                 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
914             }
915             if (CONFIG_SWSCALE_ALPHA && aDest) {
916                 YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
917             }
918
919             YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
920         } else {
921             if (uDest) {
922                 YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923                 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
924             }
925             if (CONFIG_SWSCALE_ALPHA && aDest) {
926                 YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
927             }
928
929             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
930         }
931         return;
932     }
933 #endif
934 #if COMPILE_TEMPLATE_ALTIVEC
935     yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936                           chrFilter, chrSrc, chrFilterSize,
937                           dest, uDest, vDest, dstW, chrDstW);
938 #else //COMPILE_TEMPLATE_ALTIVEC
939     yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940                 chrFilter, chrSrc, chrFilterSize,
941                 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!COMPILE_TEMPLATE_ALTIVEC
943 }
944
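/* yuv2nv12X has no SIMD path in this template and simply forwards to the C
 * implementation. */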
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946                                      const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
948 {
949     yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950                  chrFilter, chrSrc, chrFilterSize,
951                  dest, uDest, dstW, chrDstW, dstFormat);
952 }
953
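/* yuv2yuv1: unscaled vertical path.  The MMX branch runs YSCALEYUV2YV121
 * (or its _ACCURATE form) over the alpha, luma and both chroma planes; the C
 * fallback below performs the same (x+64)>>7 conversion with clipping. */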
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
956 {
957     int i;
958 #if COMPILE_TEMPLATE_MMX
959     if(!(c->flags & SWS_BITEXACT)) {
960         long p= 4;
961         uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963         x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
964
965         if (c->flags & SWS_ACCURATE_RND) {
966             while(p--) {
967                 if (dst[p]) {
968                     __asm__ volatile(
969                         YSCALEYUV2YV121_ACCURATE
970                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
971                         "g" (-counter[p])
972                         : "%"REG_a
973                     );
974                 }
975             }
976         } else {
977             while(p--) {
978                 if (dst[p]) {
979                     __asm__ volatile(
980                         YSCALEYUV2YV121
981                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
982                         "g" (-counter[p])
983                         : "%"REG_a
984                     );
985                 }
986             }
987         }
988         return;
989     }
990 #endif
991     for (i=0; i<dstW; i++) {
992         int val= (lumSrc[i]+64)>>7;
993
994         if (val&256) {
995             if (val<0) val=0;
996             else       val=255;
997         }
998
999         dest[i]= val;
1000     }
1001
1002     if (uDest)
1003         for (i=0; i<chrDstW; i++) {
1004             int u=(chrSrc[i       ]+64)>>7;
1005             int v=(chrSrc[i + VOFW]+64)>>7;
1006
1007             if ((u|v)&256) {
1008                 if (u<0)        u=0;
1009                 else if (u>255) u=255;
1010                 if (v<0)        v=0;
1011                 else if (v>255) v=255;
1012             }
1013
1014             uDest[i]= u;
1015             vDest[i]= v;
1016         }
1017
1018     if (CONFIG_SWSCALE_ALPHA && aDest)
1019         for (i=0; i<dstW; i++) {
1020             int val= (alpSrc[i]+64)>>7;
1021             aDest[i]= av_clip_uint8(val);
1022         }
1023 }
1024
1025
1026 /**
1027  * vertical scale YV12 to RGB
1028  */
1029 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1030                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1031                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1032 {
1033 #if COMPILE_TEMPLATE_MMX
1034     x86_reg dummy=0;
1035     if(!(c->flags & SWS_BITEXACT)) {
1036         if (c->flags & SWS_ACCURATE_RND) {
1037             switch(c->dstFormat) {
1038             case PIX_FMT_RGB32:
1039                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1040                     YSCALEYUV2PACKEDX_ACCURATE
1041                     YSCALEYUV2RGBX
1042                     "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1043                     "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1044                     "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1045                     YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1046                     "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1047                     "psraw                        $3, %%mm1         \n\t"
1048                     "psraw                        $3, %%mm7         \n\t"
1049                     "packuswb                  %%mm7, %%mm1         \n\t"
1050                     WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1051
1052                     YSCALEYUV2PACKEDX_END
1053                 } else {
1054                     YSCALEYUV2PACKEDX_ACCURATE
1055                     YSCALEYUV2RGBX
1056                     "pcmpeqd %%mm7, %%mm7 \n\t"
1057                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1058
1059                     YSCALEYUV2PACKEDX_END
1060                 }
1061                 return;
1062             case PIX_FMT_BGR24:
1063                 YSCALEYUV2PACKEDX_ACCURATE
1064                 YSCALEYUV2RGBX
1065                 "pxor %%mm7, %%mm7 \n\t"
1066                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1067                 "add %4, %%"REG_c"                        \n\t"
1068                 WRITEBGR24(%%REGc, %5, %%REGa)
1069
1070
1071                 :: "r" (&c->redDither),
1072                 "m" (dummy), "m" (dummy), "m" (dummy),
1073                 "r" (dest), "m" (dstW)
1074                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1075                 );
1076                 return;
1077             case PIX_FMT_RGB555:
1078                 YSCALEYUV2PACKEDX_ACCURATE
1079                 YSCALEYUV2RGBX
1080                 "pxor %%mm7, %%mm7 \n\t"
1081                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082 #ifdef DITHER1XBPP
1083                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1084                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1085                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1086 #endif
1087
1088                 WRITERGB15(%4, %5, %%REGa)
1089                 YSCALEYUV2PACKEDX_END
1090                 return;
1091             case PIX_FMT_RGB565:
1092                 YSCALEYUV2PACKEDX_ACCURATE
1093                 YSCALEYUV2RGBX
1094                 "pxor %%mm7, %%mm7 \n\t"
1095                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1096 #ifdef DITHER1XBPP
1097                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1098                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1099                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1100 #endif
1101
1102                 WRITERGB16(%4, %5, %%REGa)
1103                 YSCALEYUV2PACKEDX_END
1104                 return;
1105             case PIX_FMT_YUYV422:
1106                 YSCALEYUV2PACKEDX_ACCURATE
1107                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1108
1109                 "psraw $3, %%mm3    \n\t"
1110                 "psraw $3, %%mm4    \n\t"
1111                 "psraw $3, %%mm1    \n\t"
1112                 "psraw $3, %%mm7    \n\t"
1113                 WRITEYUY2(%4, %5, %%REGa)
1114                 YSCALEYUV2PACKEDX_END
1115                 return;
1116             }
1117         } else {
1118             switch(c->dstFormat) {
1119             case PIX_FMT_RGB32:
1120                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1121                     YSCALEYUV2PACKEDX
1122                     YSCALEYUV2RGBX
1123                     YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1124                     "psraw                        $3, %%mm1         \n\t"
1125                     "psraw                        $3, %%mm7         \n\t"
1126                     "packuswb                  %%mm7, %%mm1         \n\t"
1127                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1128                     YSCALEYUV2PACKEDX_END
1129                 } else {
1130                     YSCALEYUV2PACKEDX
1131                     YSCALEYUV2RGBX
1132                     "pcmpeqd %%mm7, %%mm7 \n\t"
1133                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134                     YSCALEYUV2PACKEDX_END
1135                 }
1136                 return;
1137             case PIX_FMT_BGR24:
1138                 YSCALEYUV2PACKEDX
1139                 YSCALEYUV2RGBX
1140                 "pxor                    %%mm7, %%mm7       \n\t"
1141                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1142                 "add                        %4, %%"REG_c"   \n\t"
1143                 WRITEBGR24(%%REGc, %5, %%REGa)
1144
1145                 :: "r" (&c->redDither),
1146                 "m" (dummy), "m" (dummy), "m" (dummy),
1147                 "r" (dest),  "m" (dstW)
1148                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1149                 );
1150                 return;
1151             case PIX_FMT_RGB555:
1152                 YSCALEYUV2PACKEDX
1153                 YSCALEYUV2RGBX
1154                 "pxor %%mm7, %%mm7 \n\t"
1155                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1156 #ifdef DITHER1XBPP
1157                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1158                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1159                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1160 #endif
1161
1162                 WRITERGB15(%4, %5, %%REGa)
1163                 YSCALEYUV2PACKEDX_END
1164                 return;
1165             case PIX_FMT_RGB565:
1166                 YSCALEYUV2PACKEDX
1167                 YSCALEYUV2RGBX
1168                 "pxor %%mm7, %%mm7 \n\t"
1169                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1170 #ifdef DITHER1XBPP
1171                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1172                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1173                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1174 #endif
1175
1176                 WRITERGB16(%4, %5, %%REGa)
1177                 YSCALEYUV2PACKEDX_END
1178                 return;
1179             case PIX_FMT_YUYV422:
1180                 YSCALEYUV2PACKEDX
1181                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1182
1183                 "psraw $3, %%mm3    \n\t"
1184                 "psraw $3, %%mm4    \n\t"
1185                 "psraw $3, %%mm1    \n\t"
1186                 "psraw $3, %%mm7    \n\t"
1187                 WRITEYUY2(%4, %5, %%REGa)
1188                 YSCALEYUV2PACKEDX_END
1189                 return;
1190             }
1191         }
1192     }
1193 #endif /* COMPILE_TEMPLATE_MMX */
1194 #if COMPILE_TEMPLATE_ALTIVEC
1195     /* The following list of supported dstFormat values should
1196        match what's found in the body of ff_yuv2packedX_altivec() */
1197     if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1198          (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1199           c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1200           c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1201             ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1202                                    chrFilter, chrSrc, chrFilterSize,
1203                                    dest, dstW, dstY);
1204     else
1205 #endif
1206         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1207                        chrFilter, chrSrc, chrFilterSize,
1208                        alpSrc, dest, dstW, dstY);
1209 }
1210
1211 /**
1212  * vertical bilinear scale YV12 to RGB
1213  */
1214 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1215                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1216 {
1217     int  yalpha1=4095- yalpha;
1218     int uvalpha1=4095-uvalpha;
1219     int i;
1220
1221 #if COMPILE_TEMPLATE_MMX
1222     if(!(c->flags & SWS_BITEXACT)) {
1223         switch(c->dstFormat) {
1224         //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1225         case PIX_FMT_RGB32:
1226             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1227 #if ARCH_X86_64
1228                 __asm__ volatile(
1229                     YSCALEYUV2RGB(%%REGBP, %5)
1230                     YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1231                     "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1232                     "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1233                     "packuswb            %%mm7, %%mm1       \n\t"
1234                     WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1235
1236                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1237                     "a" (&c->redDither)
1238                     ,"r" (abuf0), "r" (abuf1)
1239                     : "%"REG_BP
1240                 );
1241 #else
1242                 *(uint16_t **)(&c->u_temp)=abuf0;
1243                 *(uint16_t **)(&c->v_temp)=abuf1;
1244                 __asm__ volatile(
1245                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1246                     "mov        %4, %%"REG_b"               \n\t"
1247                     "push %%"REG_BP"                        \n\t"
1248                     YSCALEYUV2RGB(%%REGBP, %5)
1249                     "push                   %0              \n\t"
1250                     "push                   %1              \n\t"
1251                     "mov          "U_TEMP"(%5), %0          \n\t"
1252                     "mov          "V_TEMP"(%5), %1          \n\t"
1253                     YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1254                     "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1255                     "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1256                     "packuswb            %%mm7, %%mm1       \n\t"
1257                     "pop                    %1              \n\t"
1258                     "pop                    %0              \n\t"
1259                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1260                     "pop %%"REG_BP"                         \n\t"
1261                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1262
1263                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1264                     "a" (&c->redDither)
1265                 );
1266 #endif
1267             } else {
1268                 __asm__ volatile(
1269                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1270                     "mov        %4, %%"REG_b"               \n\t"
1271                     "push %%"REG_BP"                        \n\t"
1272                     YSCALEYUV2RGB(%%REGBP, %5)
1273                     "pcmpeqd %%mm7, %%mm7                   \n\t"
1274                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1275                     "pop %%"REG_BP"                         \n\t"
1276                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1277
1278                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1279                     "a" (&c->redDither)
1280                 );
1281             }
1282             return;
1283         case PIX_FMT_BGR24:
1284             __asm__ volatile(
1285                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1286                 "mov        %4, %%"REG_b"               \n\t"
1287                 "push %%"REG_BP"                        \n\t"
1288                 YSCALEYUV2RGB(%%REGBP, %5)
1289                 "pxor    %%mm7, %%mm7                   \n\t"
1290                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1291                 "pop %%"REG_BP"                         \n\t"
1292                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1293                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1294                 "a" (&c->redDither)
1295             );
1296             return;
1297         case PIX_FMT_RGB555:
1298             __asm__ volatile(
1299                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1300                 "mov        %4, %%"REG_b"               \n\t"
1301                 "push %%"REG_BP"                        \n\t"
1302                 YSCALEYUV2RGB(%%REGBP, %5)
1303                 "pxor    %%mm7, %%mm7                   \n\t"
1304                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1305 #ifdef DITHER1XBPP
1306                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1307                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1308                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1309 #endif
1310
1311                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1312                 "pop %%"REG_BP"                         \n\t"
1313                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1314
1315                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1316                 "a" (&c->redDither)
1317             );
1318             return;
1319         case PIX_FMT_RGB565:
1320             __asm__ volatile(
1321                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1322                 "mov        %4, %%"REG_b"               \n\t"
1323                 "push %%"REG_BP"                        \n\t"
1324                 YSCALEYUV2RGB(%%REGBP, %5)
1325                 "pxor    %%mm7, %%mm7                   \n\t"
1326                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1327 #ifdef DITHER1XBPP
1328                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1329                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1330                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1331 #endif
1332
1333                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1334                 "pop %%"REG_BP"                         \n\t"
1335                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1336                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1337                 "a" (&c->redDither)
1338             );
1339             return;
1340         case PIX_FMT_YUYV422:
1341             __asm__ volatile(
1342                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1343                 "mov %4, %%"REG_b"                        \n\t"
1344                 "push %%"REG_BP"                        \n\t"
1345                 YSCALEYUV2PACKED(%%REGBP, %5)
1346                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1347                 "pop %%"REG_BP"                         \n\t"
1348                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1349                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1350                 "a" (&c->redDither)
1351             );
1352             return;
1353         default: break;
1354         }
1355     }
1356 #endif //COMPILE_TEMPLATE_MMX
1357     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1358 }
1359
1360 /**
1361  * YV12 to RGB without scaling or interpolating
1362  */
1363 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1364                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1365 {
1366     const int yalpha1=0;
1367     int i;
1368
1369     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1370     const int yalpha= 4096; //FIXME ...
1371
1372     if (flags&SWS_FULL_CHR_H_INT) {
1373         c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1374         return;
1375     }
1376
1377 #if COMPILE_TEMPLATE_MMX
1378     if(!(flags & SWS_BITEXACT)) {
1379         if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1380             switch(dstFormat) {
1381             case PIX_FMT_RGB32:
1382                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1383                     __asm__ volatile(
1384                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1385                         "mov        %4, %%"REG_b"               \n\t"
1386                         "push %%"REG_BP"                        \n\t"
1387                         YSCALEYUV2RGB1(%%REGBP, %5)
1388                         YSCALEYUV2RGB1_ALPHA(%%REGBP)
1389                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1390                         "pop %%"REG_BP"                         \n\t"
1391                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1392
1393                         :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1394                         "a" (&c->redDither)
1395                     );
1396                 } else {
1397                     __asm__ volatile(
1398                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1399                         "mov        %4, %%"REG_b"               \n\t"
1400                         "push %%"REG_BP"                        \n\t"
1401                         YSCALEYUV2RGB1(%%REGBP, %5)
1402                         "pcmpeqd %%mm7, %%mm7                   \n\t"
1403                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1404                         "pop %%"REG_BP"                         \n\t"
1405                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1406
1407                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1408                         "a" (&c->redDither)
1409                     );
1410                 }
1411                 return;
1412             case PIX_FMT_BGR24:
1413                 __asm__ volatile(
1414                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1415                     "mov        %4, %%"REG_b"               \n\t"
1416                     "push %%"REG_BP"                        \n\t"
1417                     YSCALEYUV2RGB1(%%REGBP, %5)
1418                     "pxor    %%mm7, %%mm7                   \n\t"
1419                     WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1420                     "pop %%"REG_BP"                         \n\t"
1421                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1422
1423                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1424                     "a" (&c->redDither)
1425                 );
1426                 return;
1427             case PIX_FMT_RGB555:
1428                 __asm__ volatile(
1429                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1430                     "mov        %4, %%"REG_b"               \n\t"
1431                     "push %%"REG_BP"                        \n\t"
1432                     YSCALEYUV2RGB1(%%REGBP, %5)
1433                     "pxor    %%mm7, %%mm7                   \n\t"
1434                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1435 #ifdef DITHER1XBPP
1436                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1437                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1438                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1439 #endif
1440                     WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1441                     "pop %%"REG_BP"                         \n\t"
1442                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1443
1444                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1445                     "a" (&c->redDither)
1446                 );
1447                 return;
1448             case PIX_FMT_RGB565:
1449                 __asm__ volatile(
1450                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1451                     "mov        %4, %%"REG_b"               \n\t"
1452                     "push %%"REG_BP"                        \n\t"
1453                     YSCALEYUV2RGB1(%%REGBP, %5)
1454                     "pxor    %%mm7, %%mm7                   \n\t"
1455                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1456 #ifdef DITHER1XBPP
1457                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1458                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1459                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1460 #endif
1461
1462                     WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1463                     "pop %%"REG_BP"                         \n\t"
1464                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1465
1466                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1467                     "a" (&c->redDither)
1468                 );
1469                 return;
1470             case PIX_FMT_YUYV422:
1471                 __asm__ volatile(
1472                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1473                     "mov        %4, %%"REG_b"               \n\t"
1474                     "push %%"REG_BP"                        \n\t"
1475                     YSCALEYUV2PACKED1(%%REGBP, %5)
1476                     WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1477                     "pop %%"REG_BP"                         \n\t"
1478                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1479
1480                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481                     "a" (&c->redDither)
1482                 );
1483                 return;
1484             }
1485         } else {
1486             switch(dstFormat) {
1487             case PIX_FMT_RGB32:
1488                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1489                     __asm__ volatile(
1490                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1491                         "mov        %4, %%"REG_b"               \n\t"
1492                         "push %%"REG_BP"                        \n\t"
1493                         YSCALEYUV2RGB1b(%%REGBP, %5)
1494                         YSCALEYUV2RGB1_ALPHA(%%REGBP)
1495                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1496                         "pop %%"REG_BP"                         \n\t"
1497                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1498
1499                         :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1500                         "a" (&c->redDither)
1501                     );
1502                 } else {
1503                     __asm__ volatile(
1504                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1505                         "mov        %4, %%"REG_b"               \n\t"
1506                         "push %%"REG_BP"                        \n\t"
1507                         YSCALEYUV2RGB1b(%%REGBP, %5)
1508                         "pcmpeqd %%mm7, %%mm7                   \n\t"
1509                         WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1510                         "pop %%"REG_BP"                         \n\t"
1511                         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1512
1513                         :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514                         "a" (&c->redDither)
1515                     );
1516                 }
1517                 return;
1518             case PIX_FMT_BGR24:
1519                 __asm__ volatile(
1520                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1521                     "mov        %4, %%"REG_b"               \n\t"
1522                     "push %%"REG_BP"                        \n\t"
1523                     YSCALEYUV2RGB1b(%%REGBP, %5)
1524                     "pxor    %%mm7, %%mm7                   \n\t"
1525                     WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1526                     "pop %%"REG_BP"                         \n\t"
1527                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1528
1529                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1530                     "a" (&c->redDither)
1531                 );
1532                 return;
1533             case PIX_FMT_RGB555:
1534                 __asm__ volatile(
1535                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1536                     "mov        %4, %%"REG_b"               \n\t"
1537                     "push %%"REG_BP"                        \n\t"
1538                     YSCALEYUV2RGB1b(%%REGBP, %5)
1539                     "pxor    %%mm7, %%mm7                   \n\t"
1540                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1541 #ifdef DITHER1XBPP
1542                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1543                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1544                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1545 #endif
1546                     WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1547                     "pop %%"REG_BP"                         \n\t"
1548                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1549
1550                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551                     "a" (&c->redDither)
1552                 );
1553                 return;
1554             case PIX_FMT_RGB565:
1555                 __asm__ volatile(
1556                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1557                     "mov        %4, %%"REG_b"               \n\t"
1558                     "push %%"REG_BP"                        \n\t"
1559                     YSCALEYUV2RGB1b(%%REGBP, %5)
1560                     "pxor    %%mm7, %%mm7                   \n\t"
1561                     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1562 #ifdef DITHER1XBPP
1563                     "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1564                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1565                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1566 #endif
1567
1568                     WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1569                     "pop %%"REG_BP"                         \n\t"
1570                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1571
1572                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573                     "a" (&c->redDither)
1574                 );
1575                 return;
1576             case PIX_FMT_YUYV422:
1577                 __asm__ volatile(
1578                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1579                     "mov        %4, %%"REG_b"               \n\t"
1580                     "push %%"REG_BP"                        \n\t"
1581                     YSCALEYUV2PACKED1b(%%REGBP, %5)
1582                     WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1583                     "pop %%"REG_BP"                         \n\t"
1584                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1585
1586                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587                     "a" (&c->redDither)
1588                 );
1589                 return;
1590             }
1591         }
1592     }
1593 #endif /* COMPILE_TEMPLATE_MMX */
1594     if (uvalpha < 2048) {
1595         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1596     } else {
1597         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1598     }
1599 }
1600
1601 //FIXME yuy2* can read up to 7 samples too many
1602
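/**
 * Extract the luma (Y) plane from packed YUYV input;
 * equivalent to the C fallback below: dst[i] = src[2*i].
 */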
1603 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1604 {
1605 #if COMPILE_TEMPLATE_MMX
1606     __asm__ volatile(
1607         "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1608         "mov                    %0, %%"REG_a"       \n\t"
1609         "1:                                         \n\t"
1610         "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1611         "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1612         "pand                %%mm2, %%mm0           \n\t"
1613         "pand                %%mm2, %%mm1           \n\t"
1614         "packuswb            %%mm1, %%mm0           \n\t"
1615         "movq                %%mm0, (%2, %%"REG_a") \n\t"
1616         "add                    $8, %%"REG_a"       \n\t"
1617         " js                    1b                  \n\t"
1618         : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1619         : "%"REG_a
1620     );
1621 #else
1622     int i;
1623     for (i=0; i<width; i++)
1624         dst[i]= src[2*i];
1625 #endif
1626 }
1627
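/**
 * Extract the chroma (U/V) samples from packed YUYV input;
 * equivalent to dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3].
 * src1 and src2 must point to the same line (see the assert at the end).
 */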
1628 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1629 {
1630 #if COMPILE_TEMPLATE_MMX
1631     __asm__ volatile(
1632         "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1633         "mov                    %0, %%"REG_a"       \n\t"
1634         "1:                                         \n\t"
1635         "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1636         "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1637         "psrlw                  $8, %%mm0           \n\t"
1638         "psrlw                  $8, %%mm1           \n\t"
1639         "packuswb            %%mm1, %%mm0           \n\t"
1640         "movq                %%mm0, %%mm1           \n\t"
1641         "psrlw                  $8, %%mm0           \n\t"
1642         "pand                %%mm4, %%mm1           \n\t"
1643         "packuswb            %%mm0, %%mm0           \n\t"
1644         "packuswb            %%mm1, %%mm1           \n\t"
1645         "movd                %%mm0, (%3, %%"REG_a") \n\t"
1646         "movd                %%mm1, (%2, %%"REG_a") \n\t"
1647         "add                    $4, %%"REG_a"       \n\t"
1648         " js                    1b                  \n\t"
1649         : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1650         : "%"REG_a
1651     );
1652 #else
1653     int i;
1654     for (i=0; i<width; i++) {
1655         dstU[i]= src1[4*i + 1];
1656         dstV[i]= src1[4*i + 3];
1657     }
1658 #endif
1659     assert(src1 == src2);
1660 }
1661
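/**
 * Take the high byte of each little-endian 16-bit sample from two
 * separate source planes; equivalent to dstU[i] = src1[2*i+1],
 * dstV[i] = src2[2*i+1].
 */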
1662 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1663 {
1664 #if COMPILE_TEMPLATE_MMX
1665     __asm__ volatile(
1666         "mov                    %0, %%"REG_a"       \n\t"
1667         "1:                                         \n\t"
1668         "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1669         "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1670         "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1671         "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1672         "psrlw                  $8, %%mm0           \n\t"
1673         "psrlw                  $8, %%mm1           \n\t"
1674         "psrlw                  $8, %%mm2           \n\t"
1675         "psrlw                  $8, %%mm3           \n\t"
1676         "packuswb            %%mm1, %%mm0           \n\t"
1677         "packuswb            %%mm3, %%mm2           \n\t"
1678         "movq                %%mm0, (%3, %%"REG_a") \n\t"
1679         "movq                %%mm2, (%4, %%"REG_a") \n\t"
1680         "add                    $8, %%"REG_a"       \n\t"
1681         " js                    1b                  \n\t"
1682         : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1683         : "%"REG_a
1684     );
1685 #else
1686     int i;
1687     for (i=0; i<width; i++) {
1688         dstU[i]= src1[2*i + 1];
1689         dstV[i]= src2[2*i + 1];
1690     }
1691 #endif
1692 }
1693
1694 /* This is almost identical to the previous, and exists only because
1695  * yuy2To(Y|UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1696 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1697 {
1698 #if COMPILE_TEMPLATE_MMX
1699     __asm__ volatile(
1700         "mov                  %0, %%"REG_a"         \n\t"
1701         "1:                                         \n\t"
1702         "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1703         "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1704         "psrlw                $8, %%mm0             \n\t"
1705         "psrlw                $8, %%mm1             \n\t"
1706         "packuswb          %%mm1, %%mm0             \n\t"
1707         "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1708         "add                  $8, %%"REG_a"         \n\t"
1709         " js                  1b                    \n\t"
1710         : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1711         : "%"REG_a
1712     );
1713 #else
1714     int i;
1715     for (i=0; i<width; i++)
1716         dst[i]= src[2*i+1];
1717 #endif
1718 }
1719
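/**
 * Extract the chroma (U/V) samples from packed UYVY input;
 * equivalent to dstU[i] = src1[4*i], dstV[i] = src1[4*i+2].
 * src1 and src2 must point to the same line (see the assert at the end).
 */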
1720 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1721 {
1722 #if COMPILE_TEMPLATE_MMX
1723     __asm__ volatile(
1724         "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1725         "mov                    %0, %%"REG_a"       \n\t"
1726         "1:                                         \n\t"
1727         "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1728         "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1729         "pand                %%mm4, %%mm0           \n\t"
1730         "pand                %%mm4, %%mm1           \n\t"
1731         "packuswb            %%mm1, %%mm0           \n\t"
1732         "movq                %%mm0, %%mm1           \n\t"
1733         "psrlw                  $8, %%mm0           \n\t"
1734         "pand                %%mm4, %%mm1           \n\t"
1735         "packuswb            %%mm0, %%mm0           \n\t"
1736         "packuswb            %%mm1, %%mm1           \n\t"
1737         "movd                %%mm0, (%3, %%"REG_a") \n\t"
1738         "movd                %%mm1, (%2, %%"REG_a") \n\t"
1739         "add                    $4, %%"REG_a"       \n\t"
1740         " js                    1b                  \n\t"
1741         : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1742         : "%"REG_a
1743     );
1744 #else
1745     int i;
1746     for (i=0; i<width; i++) {
1747         dstU[i]= src1[4*i + 0];
1748         dstV[i]= src1[4*i + 2];
1749     }
1750 #endif
1751     assert(src1 == src2);
1752 }
1753
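/**
 * Take the high byte of each big-endian 16-bit sample from two
 * separate source planes; equivalent to dstU[i] = src1[2*i],
 * dstV[i] = src2[2*i].
 */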
1754 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1755 {
1756 #if COMPILE_TEMPLATE_MMX
1757     __asm__ volatile(
1758         "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1759         "mov                    %0, %%"REG_a"       \n\t"
1760         "1:                                         \n\t"
1761         "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1762         "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1763         "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1764         "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1765         "pand                %%mm4, %%mm0           \n\t"
1766         "pand                %%mm4, %%mm1           \n\t"
1767         "pand                %%mm4, %%mm2           \n\t"
1768         "pand                %%mm4, %%mm3           \n\t"
1769         "packuswb            %%mm1, %%mm0           \n\t"
1770         "packuswb            %%mm3, %%mm2           \n\t"
1771         "movq                %%mm0, (%3, %%"REG_a") \n\t"
1772         "movq                %%mm2, (%4, %%"REG_a") \n\t"
1773         "add                    $8, %%"REG_a"       \n\t"
1774         " js                    1b                  \n\t"
1775         : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1776         : "%"REG_a
1777     );
1778 #else
1779     int i;
1780     for (i=0; i<width; i++) {
1781         dstU[i]= src1[2*i];
1782         dstV[i]= src2[2*i];
1783     }
1784 #endif
1785 }
1786
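/**
 * De-interleave the packed chroma plane of NV12/NV21 into two planes:
 * dst1 gets the even bytes, dst2 the odd bytes. nv12ToUV() and
 * nv21ToUV() below differ only in which destination receives U and V.
 */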
1787 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1788                                     const uint8_t *src, long width)
1789 {
1790 #if COMPILE_TEMPLATE_MMX
1791     __asm__ volatile(
1792         "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1793         "mov                    %0, %%"REG_a"       \n\t"
1794         "1:                                         \n\t"
1795         "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1796         "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1797         "movq                %%mm0, %%mm2           \n\t"
1798         "movq                %%mm1, %%mm3           \n\t"
1799         "pand                %%mm4, %%mm0           \n\t"
1800         "pand                %%mm4, %%mm1           \n\t"
1801         "psrlw                  $8, %%mm2           \n\t"
1802         "psrlw                  $8, %%mm3           \n\t"
1803         "packuswb            %%mm1, %%mm0           \n\t"
1804         "packuswb            %%mm3, %%mm2           \n\t"
1805         "movq                %%mm0, (%2, %%"REG_a") \n\t"
1806         "movq                %%mm2, (%3, %%"REG_a") \n\t"
1807         "add                    $8, %%"REG_a"       \n\t"
1808         " js                    1b                  \n\t"
1809         : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1810         : "%"REG_a
1811     );
1812 #else
1813     int i;
1814     for (i = 0; i < width; i++) {
1815         dst1[i] = src[2*i+0];
1816         dst2[i] = src[2*i+1];
1817     }
1818 #endif
1819 }
1820
1821 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822                                     const uint8_t *src1, const uint8_t *src2,
1823                                     long width, uint32_t *unused)
1824 {
1825     RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1826 }
1827
1828 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1829                                     const uint8_t *src1, const uint8_t *src2,
1830                                     long width, uint32_t *unused)
1831 {
1832     RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1833 }
1834
1835 #if COMPILE_TEMPLATE_MMX
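/**
 * MMX path for packed 24-bit BGR/RGB to luma conversion: loads the
 * coefficient pair matching srcFormat, then converts 4 pixels per
 * iteration with pmaddwd, rounds via ff_bgr24toYOffset and packs the
 * results to bytes.
 */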
1836 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1837 {
1838
1839     if(srcFormat == PIX_FMT_BGR24) {
1840         __asm__ volatile(
1841             "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1842             "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1843             :
1844         );
1845     } else {
1846         __asm__ volatile(
1847             "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1848             "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1849             :
1850         );
1851     }
1852
1853     __asm__ volatile(
1854         "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1855         "mov                        %2, %%"REG_a"   \n\t"
1856         "pxor                    %%mm7, %%mm7       \n\t"
1857         "1:                                         \n\t"
1858         PREFETCH"               64(%0)              \n\t"
1859         "movd                     (%0), %%mm0       \n\t"
1860         "movd                    2(%0), %%mm1       \n\t"
1861         "movd                    6(%0), %%mm2       \n\t"
1862         "movd                    8(%0), %%mm3       \n\t"
1863         "add                       $12, %0          \n\t"
1864         "punpcklbw               %%mm7, %%mm0       \n\t"
1865         "punpcklbw               %%mm7, %%mm1       \n\t"
1866         "punpcklbw               %%mm7, %%mm2       \n\t"
1867         "punpcklbw               %%mm7, %%mm3       \n\t"
1868         "pmaddwd                 %%mm5, %%mm0       \n\t"
1869         "pmaddwd                 %%mm6, %%mm1       \n\t"
1870         "pmaddwd                 %%mm5, %%mm2       \n\t"
1871         "pmaddwd                 %%mm6, %%mm3       \n\t"
1872         "paddd                   %%mm1, %%mm0       \n\t"
1873         "paddd                   %%mm3, %%mm2       \n\t"
1874         "paddd                   %%mm4, %%mm0       \n\t"
1875         "paddd                   %%mm4, %%mm2       \n\t"
1876         "psrad                     $15, %%mm0       \n\t"
1877         "psrad                     $15, %%mm2       \n\t"
1878         "packssdw                %%mm2, %%mm0       \n\t"
1879         "packuswb                %%mm0, %%mm0       \n\t"
1880         "movd                %%mm0, (%1, %%"REG_a") \n\t"
1881         "add                        $4, %%"REG_a"   \n\t"
1882         " js                        1b              \n\t"
1883     : "+r" (src)
1884     : "r" (dst+width), "g" ((x86_reg)-width)
1885     : "%"REG_a
1886     );
1887 }
1888
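/**
 * MMX path for packed 24-bit BGR/RGB to chroma conversion; the U and V
 * coefficients are passed in as operand %4 (ff_bgr24toUV indexed by
 * srcFormat), and 4 pixels are converted per iteration.
 */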
1889 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1890 {
1891     __asm__ volatile(
1892         "movq                    24+%4, %%mm6       \n\t"
1893         "mov                        %3, %%"REG_a"   \n\t"
1894         "pxor                    %%mm7, %%mm7       \n\t"
1895         "1:                                         \n\t"
1896         PREFETCH"               64(%0)              \n\t"
1897         "movd                     (%0), %%mm0       \n\t"
1898         "movd                    2(%0), %%mm1       \n\t"
1899         "punpcklbw               %%mm7, %%mm0       \n\t"
1900         "punpcklbw               %%mm7, %%mm1       \n\t"
1901         "movq                    %%mm0, %%mm2       \n\t"
1902         "movq                    %%mm1, %%mm3       \n\t"
1903         "pmaddwd                    %4, %%mm0       \n\t"
1904         "pmaddwd                  8+%4, %%mm1       \n\t"
1905         "pmaddwd                 16+%4, %%mm2       \n\t"
1906         "pmaddwd                 %%mm6, %%mm3       \n\t"
1907         "paddd                   %%mm1, %%mm0       \n\t"
1908         "paddd                   %%mm3, %%mm2       \n\t"
1909
1910         "movd                    6(%0), %%mm1       \n\t"
1911         "movd                    8(%0), %%mm3       \n\t"
1912         "add                       $12, %0          \n\t"
1913         "punpcklbw               %%mm7, %%mm1       \n\t"
1914         "punpcklbw               %%mm7, %%mm3       \n\t"
1915         "movq                    %%mm1, %%mm4       \n\t"
1916         "movq                    %%mm3, %%mm5       \n\t"
1917         "pmaddwd                    %4, %%mm1       \n\t"
1918         "pmaddwd                  8+%4, %%mm3       \n\t"
1919         "pmaddwd                 16+%4, %%mm4       \n\t"
1920         "pmaddwd                 %%mm6, %%mm5       \n\t"
1921         "paddd                   %%mm3, %%mm1       \n\t"
1922         "paddd                   %%mm5, %%mm4       \n\t"
1923
1924         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1925         "paddd                   %%mm3, %%mm0       \n\t"
1926         "paddd                   %%mm3, %%mm2       \n\t"
1927         "paddd                   %%mm3, %%mm1       \n\t"
1928         "paddd                   %%mm3, %%mm4       \n\t"
1929         "psrad                     $15, %%mm0       \n\t"
1930         "psrad                     $15, %%mm2       \n\t"
1931         "psrad                     $15, %%mm1       \n\t"
1932         "psrad                     $15, %%mm4       \n\t"
1933         "packssdw                %%mm1, %%mm0       \n\t"
1934         "packssdw                %%mm4, %%mm2       \n\t"
1935         "packuswb                %%mm0, %%mm0       \n\t"
1936         "packuswb                %%mm2, %%mm2       \n\t"
1937         "movd                %%mm0, (%1, %%"REG_a") \n\t"
1938         "movd                %%mm2, (%2, %%"REG_a") \n\t"
1939         "add                        $4, %%"REG_a"   \n\t"
1940         " js                        1b              \n\t"
1941     : "+r" (src)
1942     : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1943     : "%"REG_a
1944     );
1945 }
1946 #endif
1947
1948 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1949 {
1950 #if COMPILE_TEMPLATE_MMX
1951     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1952 #else
1953     int i;
1954     for (i=0; i<width; i++) {
1955         int b= src[i*3+0];
1956         int g= src[i*3+1];
1957         int r= src[i*3+2];
1958
1959         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1960     }
1961 #endif /* COMPILE_TEMPLATE_MMX */
1962 }
1963
1964 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1965 {
1966 #if COMPILE_TEMPLATE_MMX
1967     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1968 #else
1969     int i;
1970     for (i=0; i<width; i++) {
1971         int b= src1[3*i + 0];
1972         int g= src1[3*i + 1];
1973         int r= src1[3*i + 2];
1974
1975         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1976         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1977     }
1978 #endif /* COMPILE_TEMPLATE_MMX */
1979     assert(src1 == src2);
1980 }
1981
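/**
 * Like bgr24ToUV(), but sums each horizontal pair of input pixels and
 * halves the result (note the RGB2YUV_SHIFT+1), for chroma planes that
 * are horizontally subsampled by 2. Only a C implementation exists.
 */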
1982 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1983 {
1984     int i;
1985     for (i=0; i<width; i++) {
1986         int b= src1[6*i + 0] + src1[6*i + 3];
1987         int g= src1[6*i + 1] + src1[6*i + 4];
1988         int r= src1[6*i + 2] + src1[6*i + 5];
1989
1990         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1991         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1992     }
1993     assert(src1 == src2);
1994 }
1995
1996 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1997 {
1998 #if COMPILE_TEMPLATE_MMX
1999     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2000 #else
2001     int i;
2002     for (i=0; i<width; i++) {
2003         int r= src[i*3+0];
2004         int g= src[i*3+1];
2005         int b= src[i*3+2];
2006
2007         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2008     }
2009 #endif
2010 }
2011
2012 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2013 {
2014 #if COMPILE_TEMPLATE_MMX
2015     assert(src1==src2);
2016     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2017 #else
2018     int i;
2019     assert(src1==src2);
2020     for (i=0; i<width; i++) {
2021         int r= src1[3*i + 0];
2022         int g= src1[3*i + 1];
2023         int b= src1[3*i + 2];
2024
2025         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2026         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2027     }
2028 #endif
2029 }
2030
2031 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2032 {
2033     int i;
2034     assert(src1==src2);
2035     for (i=0; i<width; i++) {
2036         int r= src1[6*i + 0] + src1[6*i + 3];
2037         int g= src1[6*i + 1] + src1[6*i + 4];
2038         int b= src1[6*i + 2] + src1[6*i + 5];
2039
2040         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2041         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2042     }
2043 }
2044
2045
2046 // bilinear / bicubic scaling
2047 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2048                                   const int16_t *filter, const int16_t *filterPos, long filterSize)
2049 {
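    /* Each output sample applies an FIR filter of length filterSize at
     * src[filterPos[i]]. Roughly (illustrative sketch of what the MMX
     * loops below compute; they additionally saturate the result to
     * 16 bits via packssdw):
     *
     *     int j, val = 0;
     *     for (j = 0; j < filterSize; j++)
     *         val += src[filterPos[i] + j] * filter[filterSize*i + j];
     *     dst[i] = val >> 7;
     */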
2050 #if COMPILE_TEMPLATE_MMX
2051     assert(filterSize % 4 == 0 && filterSize>0);
2052     if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
2053         x86_reg counter= -2*dstW;
2054         filter-= counter*2;
2055         filterPos-= counter/2;
2056         dst-= counter/2;
2057         __asm__ volatile(
2058 #if defined(PIC)
2059             "push            %%"REG_b"              \n\t"
2060 #endif
2061             "pxor                %%mm7, %%mm7       \n\t"
2062             "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2063             "mov             %%"REG_a", %%"REG_BP"  \n\t"
2064             ASMALIGN(4)
2065             "1:                                     \n\t"
2066             "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2067             "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2068             "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2069             "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2070             "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2071             "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2072             "punpcklbw           %%mm7, %%mm0       \n\t"
2073             "punpcklbw           %%mm7, %%mm2       \n\t"
2074             "pmaddwd             %%mm1, %%mm0       \n\t"
2075             "pmaddwd             %%mm2, %%mm3       \n\t"
2076             "movq                %%mm0, %%mm4       \n\t"
2077             "punpckldq           %%mm3, %%mm0       \n\t"
2078             "punpckhdq           %%mm3, %%mm4       \n\t"
2079             "paddd               %%mm4, %%mm0       \n\t"
2080             "psrad                  $7, %%mm0       \n\t"
2081             "packssdw            %%mm0, %%mm0       \n\t"
2082             "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2083             "add                    $4, %%"REG_BP"  \n\t"
2084             " jnc                   1b              \n\t"
2085
2086             "pop            %%"REG_BP"              \n\t"
2087 #if defined(PIC)
2088             "pop             %%"REG_b"              \n\t"
2089 #endif
2090             : "+a" (counter)
2091             : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2092 #if !defined(PIC)
2093             : "%"REG_b
2094 #endif
2095         );
2096     } else if (filterSize==8) {
2097         x86_reg counter= -2*dstW;
2098         filter-= counter*4;
2099         filterPos-= counter/2;
2100         dst-= counter/2;
2101         __asm__ volatile(
2102 #if defined(PIC)
2103             "push             %%"REG_b"             \n\t"
2104 #endif
2105             "pxor                 %%mm7, %%mm7      \n\t"
2106             "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2107             "mov              %%"REG_a", %%"REG_BP" \n\t"
2108             ASMALIGN(4)
2109             "1:                                     \n\t"
2110             "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2111             "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2112             "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2113             "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2114             "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2115             "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2116             "punpcklbw            %%mm7, %%mm0      \n\t"
2117             "punpcklbw            %%mm7, %%mm2      \n\t"
2118             "pmaddwd              %%mm1, %%mm0      \n\t"
2119             "pmaddwd              %%mm2, %%mm3      \n\t"
2120
2121             "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2122             "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2123             "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2124             "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2125             "punpcklbw            %%mm7, %%mm4      \n\t"
2126             "punpcklbw            %%mm7, %%mm2      \n\t"
2127             "pmaddwd              %%mm1, %%mm4      \n\t"
2128             "pmaddwd              %%mm2, %%mm5      \n\t"
2129             "paddd                %%mm4, %%mm0      \n\t"
2130             "paddd                %%mm5, %%mm3      \n\t"
2131             "movq                 %%mm0, %%mm4      \n\t"
2132             "punpckldq            %%mm3, %%mm0      \n\t"
2133             "punpckhdq            %%mm3, %%mm4      \n\t"
2134             "paddd                %%mm4, %%mm0      \n\t"
2135             "psrad                   $7, %%mm0      \n\t"
2136             "packssdw             %%mm0, %%mm0      \n\t"
2137             "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2138             "add                     $4, %%"REG_BP" \n\t"
2139             " jnc                    1b             \n\t"
2140
2141             "pop             %%"REG_BP"             \n\t"