Update to MPlayer SVN rev 29473 and FFmpeg SVN rev 19572.
[vaapi:athaifas-mplayer.git] / libswscale / .svn / text-base / swscale_template.c.svn-base
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  *
20  * The C code (not assembly, MMX, ...) of this file can be used
21  * under the LGPL license.
22  */
23
/*
 * Remove any earlier definitions of the helper macros defined below.
 * NOTE(review): presumably required because this template is included several
 * times with different COMPILE_TEMPLATE_* settings -- confirm in the includer.
 */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29
/*
 * PREFETCH / PREFETCHW: mnemonic strings meant to be pasted into asm text.
 *   COMPILE_TEMPLATE_AMD3DNOW : "prefetch"    / "prefetchw"
 *   COMPILE_TEMPLATE_MMX2     : "prefetchnta" / "prefetcht0"
 *   neither                   : " # nop" (text that begins with '#')
 * The 3DNow! branch is tested first, so it wins when both flags are set.
 * NOTE(review): the fallback relies on the assembler treating '#' as the
 * start of a comment -- confirm for every assembler this is built with.
 */
30 #if COMPILE_TEMPLATE_AMD3DNOW
31 #define PREFETCH  "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif COMPILE_TEMPLATE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
36 #else
37 #define PREFETCH  " # nop"
38 #define PREFETCHW " # nop"
39 #endif
40
/*
 * PAVGB(a,b): asm text for one byte-average instruction with operands a, b
 * (both stringified with #, so they are pasted literally).
 *   COMPILE_TEMPLATE_MMX2     : "pavgb"
 *   COMPILE_TEMPLATE_AMD3DNOW : "pavgusb"
 * The MMX2 branch is tested first.  When neither flag is set PAVGB stays
 * undefined, so code using it must be guarded by the same conditions.
 */
41 #if COMPILE_TEMPLATE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif COMPILE_TEMPLATE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
45 #endif
46
/*
 * MOVNTQ(a,b): asm text for a 64-bit store of a to b.
 *   COMPILE_TEMPLATE_MMX2 : "movntq"
 *   otherwise             : plain "movq"
 * REAL_MOVNTQ stringifies its arguments with #.  Callers go through the
 * MOVNTQ wrapper so that the arguments are macro-expanded first; an argument
 * that is the direct operand of # would be pasted unexpanded.
 * NOTE(review): the uses in this file pass "%%REGa", which presumably depends
 * on REGa being a macro for the register name -- confirm where it is defined.
 */
47 #if COMPILE_TEMPLATE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
49 #else
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
51 #endif
52 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
53
54 #if COMPILE_TEMPLATE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
56 #endif
57
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): a complete __asm__ statement that
 * sums several coefficient-weighted source lines into one line of 8-bit
 * output.
 *
 * Macro arguments:
 *   x      - string literal: byte displacement added to every source load
 *   offset - string literal: displacement from operand %0 to the filter list
 *   dest   - output pointer (operand %1); 8 bytes are stored per iteration
 *   width  - loop bound (operand %2); the index advances by 8 and the loop
 *            repeats while index < width (unsigned compare, "jb")
 *
 * Operand %0 is &c->redDither, so a variable named c must be in scope at the
 * point of use; VROUNDER_OFFSET and offset are displacements from that
 * address.
 *
 * The filter list is walked in 16-byte steps: a source pointer at +0 and the
 * coefficient words at +8.  A null source pointer ends the list.  Each
 * source line is read as 16-bit words at pointer + x + 2*index.
 *
 * Label 1 is the target of both loops: "jnz 1b" continues with the next
 * filter entry, and "jb 1b" starts the next group of 8 outputs after the
 * accumulators, list cursor and first source pointer have been reloaded.
 *
 * Clobber list: REG_a (index), REG_d (list cursor), REG_S (source pointer).
 * NOTE(review): the statement stores through %1 and uses mm0-mm5, but
 * declares neither a "memory" clobber nor the MMX registers -- confirm that
 * this is safe for the callers and compilers in use.
 */
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
59     __asm__ volatile(\
60     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
61     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
62     "movq                             %%mm3, %%mm4      \n\t"\
63     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
64     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
65     ASMALIGN(4) /* FIXME Unroll? */\
66     "1:                                                 \n\t"\
67     "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
68     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
69     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
70     "add                                $16, %%"REG_d"  \n\t"\
71     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
72     "test                         %%"REG_S", %%"REG_S"  \n\t" /* next source pointer null? consumed by jnz below */\
73     "pmulhw                           %%mm0, %%mm2      \n\t"\
74     "pmulhw                           %%mm0, %%mm5      \n\t"\
75     "paddw                            %%mm2, %%mm3      \n\t"\
76     "paddw                            %%mm5, %%mm4      \n\t"\
77     " jnz                                1b             \n\t"\
78     "psraw                               $3, %%mm3      \n\t"\
79     "psraw                               $3, %%mm4      \n\t"\
80     "packuswb                         %%mm4, %%mm3      \n\t"\
81     MOVNTQ(%%mm3, (%1, %%REGa))\
82     "add                                 $8, %%"REG_a"  \n\t"\
83     "cmp                                 %2, %%"REG_a"  \n\t" /* flags consumed by jb after the reload below */\
84     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
85     "movq                             %%mm3, %%mm4      \n\t"\
86     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
87     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
88     "jb                                  1b             \n\t"\
89     :: "r" (&c->redDither),\
90     "r" (dest), "g" (width)\
91     : "%"REG_a, "%"REG_d, "%"REG_S\
92     );
93
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): same interface and
 * operands as YSCALEYUV2YV12X (%0 = &c->redDither, %1 = dest, %2 = width),
 * but the products are accumulated as 32-bit values.
 *
 * Each filter entry is APCK_SIZE bytes and describes two source lines:
 *   +0         first source pointer
 *   +APCK_PTR2 second source pointer
 *   +APCK_COEF coefficient words used by pmaddwd
 * The first pointer of the following entry is read from +APCK_SIZE; a null
 * value ends the list.
 *
 * Words of the two lines are interleaved (punpcklwd/punpckhwd) so that one
 * pmaddwd yields line1*coef + line2*coef as a dword.  mm4/mm5 accumulate the
 * first four outputs, mm6/mm7 the next four.  After the list has been walked
 * the sums are shifted right by 16, packed to words with signed saturation,
 * the value at VROUNDER_OFFSET(%0) is added, the result is shifted right by
 * 3, packed to bytes with unsigned saturation and stored to dest + index.
 *
 * Label 1 is shared by the per-entry loop ("jnz") and the per-8-outputs loop
 * ("jb"), the latter after the accumulators and list cursor have been reset.
 *
 * Clobber list: REG_a, REG_d, REG_S.
 * NOTE(review): no "memory" clobber and no MMX registers are declared even
 * though the statement stores through %1 and uses mm0-mm7 -- confirm.
 */
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
95     __asm__ volatile(\
96     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
97     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
98     "pxor                             %%mm4, %%mm4      \n\t"\
99     "pxor                             %%mm5, %%mm5      \n\t"\
100     "pxor                             %%mm6, %%mm6      \n\t"\
101     "pxor                             %%mm7, %%mm7      \n\t"\
102     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
103     ASMALIGN(4) \
104     "1:                                                 \n\t"\
105     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
106     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
107     "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
108     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
109     "movq                             %%mm0, %%mm3      \n\t"\
110     "punpcklwd                        %%mm1, %%mm0      \n\t"\
111     "punpckhwd                        %%mm1, %%mm3      \n\t"\
112     "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
113     "pmaddwd                          %%mm1, %%mm0      \n\t"\
114     "pmaddwd                          %%mm1, %%mm3      \n\t"\
115     "paddd                            %%mm0, %%mm4      \n\t"\
116     "paddd                            %%mm3, %%mm5      \n\t"\
117     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
118     "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
119     "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
120     "test                         %%"REG_S", %%"REG_S"  \n\t" /* next entry's pointer null? consumed by jnz below */\
121     "movq                             %%mm2, %%mm0      \n\t"\
122     "punpcklwd                        %%mm3, %%mm2      \n\t"\
123     "punpckhwd                        %%mm3, %%mm0      \n\t"\
124     "pmaddwd                          %%mm1, %%mm2      \n\t"\
125     "pmaddwd                          %%mm1, %%mm0      \n\t"\
126     "paddd                            %%mm2, %%mm6      \n\t"\
127     "paddd                            %%mm0, %%mm7      \n\t"\
128     " jnz                                1b             \n\t"\
129     "psrad                              $16, %%mm4      \n\t"\
130     "psrad                              $16, %%mm5      \n\t"\
131     "psrad                              $16, %%mm6      \n\t"\
132     "psrad                              $16, %%mm7      \n\t"\
133     "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
134     "packssdw                         %%mm5, %%mm4      \n\t"\
135     "packssdw                         %%mm7, %%mm6      \n\t"\
136     "paddw                            %%mm0, %%mm4      \n\t"\
137     "paddw                            %%mm0, %%mm6      \n\t"\
138     "psraw                               $3, %%mm4      \n\t"\
139     "psraw                               $3, %%mm6      \n\t"\
140     "packuswb                         %%mm6, %%mm4      \n\t"\
141     MOVNTQ(%%mm4, (%1, %%REGa))\
142     "add                                 $8, %%"REG_a"  \n\t"\
143     "cmp                                 %2, %%"REG_a"  \n\t" /* flags consumed by jb after the reset below */\
144     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
145     "pxor                             %%mm4, %%mm4      \n\t"\
146     "pxor                             %%mm5, %%mm5      \n\t"\
147     "pxor                             %%mm6, %%mm6      \n\t"\
148     "pxor                             %%mm7, %%mm7      \n\t"\
149     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
150     "jb                                  1b             \n\t"\
151     :: "r" (&c->redDither),\
152     "r" (dest), "g" (width)\
153     : "%"REG_a, "%"REG_d, "%"REG_S\
154     );
155
/*
 * YSCALEYUV2YV121: asm instruction text only (no __asm__ wrapper and no
 * operand list) that narrows 16-bit samples to 8-bit ones.
 *
 * Operands expected from the enclosing asm statement:
 *   %0 - source base; words are read at %0 + 2*index
 *   %1 - destination base; bytes are written at %1 + index
 *   %2 - initial value of the index (kept in REG_a)
 *
 * Per iteration: 8 words are shifted right by 7 (arithmetic), packed to
 * bytes with unsigned saturation and stored; the index then advances by 8
 * and the loop repeats while that addition produces no carry ("jnc").
 * NOTE(review): this presumably means %2 is a negative count and %0/%1 point
 * at the end of their buffers -- confirm at the call site.
 */
156 #define YSCALEYUV2YV121 \
157     "mov %2, %%"REG_a"                    \n\t"\
158     ASMALIGN(4) /* FIXME Unroll? */\
159     "1:                                   \n\t"\
160     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
161     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
162     "psraw                 $7, %%mm0      \n\t"\
163     "psraw                 $7, %%mm1      \n\t"\
164     "packuswb           %%mm1, %%mm0      \n\t"\
165     MOVNTQ(%%mm0, (%1, %%REGa))\
166     "add                   $8, %%"REG_a"  \n\t"\
167     "jnc                   1b             \n\t"
168
/*
 * YSCALEYUV2YV121_ACCURATE: same operands and loop as YSCALEYUV2YV121
 * (%0 source words, %1 destination bytes, %2 initial index), but with
 * rounding: 64 (half of 1<<7) is added to every word, using signed
 * saturation, before the arithmetic shift right by 7.
 *
 * The constant is built in mm7 without a memory load: all-ones words,
 * logical shift right by 15 gives 1, shift left by 6 gives 64.
 */
169 #define YSCALEYUV2YV121_ACCURATE \
170     "mov %2, %%"REG_a"                    \n\t"\
171     "pcmpeqw %%mm7, %%mm7                 \n\t"\
172     "psrlw                 $15, %%mm7     \n\t"\
173     "psllw                  $6, %%mm7     \n\t"\
174     ASMALIGN(4) /* FIXME Unroll? */\
175     "1:                                   \n\t"\
176     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
177     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
178     "paddsw             %%mm7, %%mm0      \n\t"\
179     "paddsw             %%mm7, %%mm1      \n\t"\
180     "psraw                 $7, %%mm0      \n\t"\
181     "psraw                 $7, %%mm1      \n\t"\
182     "packuswb           %%mm1, %%mm0      \n\t"\
183     MOVNTQ(%%mm0, (%1, %%REGa))\
184     "add                   $8, %%"REG_a"  \n\t"\
185     "jnc                   1b             \n\t"
186
187 /*
188     :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189        "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190        "r" (dest), "m" (dstW),
191        "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
193 */
/*
 * YSCALEYUV2PACKEDX_UV: opens an "__asm__ volatile(" statement and emits the
 * chroma half of the vertical filter.  The statement is deliberately left
 * open: further instruction text is appended by other macros and
 * YSCALEYUV2PACKEDX_END supplies the operand lists and the closing ");".
 *
 * REG_a is zeroed once and serves as the index.  Label 1 marks the start of
 * the work for one group of outputs; the branch back to it is not part of
 * this macro.
 * NOTE(review): presumably the output-writing code appended later contains
 * that branch -- confirm at the point of use.
 *
 * Label 2 is the loop over 16-byte filter entries that start at
 * CHR_MMX_FILTER_OFFSET(%0): source pointer at +0, coefficient words at +8,
 * a null pointer ends the list.
 *
 * On exit mm3 holds the accumulated U words and mm4 the accumulated V words
 * (V is read at displacement VOF from the same pointer); both accumulators
 * start from the value at VROUNDER_OFFSET(%0).  mm0, mm2 and mm5 are scratch.
 */
194 #define YSCALEYUV2PACKEDX_UV \
195     __asm__ volatile(\
196     "xor                   %%"REG_a", %%"REG_a"     \n\t"\
197     ASMALIGN(4)\
198     "nop                                            \n\t"\
199     "1:                                             \n\t"\
200     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
201     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
202     "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
203     "movq                      %%mm3, %%mm4         \n\t"\
204     ASMALIGN(4)\
205     "2:                                             \n\t"\
206     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
207     "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
208     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
209     "add                         $16, %%"REG_d"     \n\t"\
210     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
211     "pmulhw                    %%mm0, %%mm2         \n\t"\
212     "pmulhw                    %%mm0, %%mm5         \n\t"\
213     "paddw                     %%mm2, %%mm3         \n\t"\
214     "paddw                     %%mm5, %%mm4         \n\t"\
215     "test                  %%"REG_S", %%"REG_S"     \n\t"\
216     " jnz                         2b                \n\t"\
217
/*
 * YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2): instruction text
 * for the vertical filter of two adjacent groups of four 16-bit samples,
 * with the registers chosen by the caller.
 *
 *   offset - string literal: displacement from %0 to the filter list
 *   coeff  - register that receives the coefficient words of each entry
 *   src1/2 - scratch registers for the samples at index and index + 8 bytes
 *   dst1/2 - accumulators; both start from the value at VROUNDER_OFFSET(%0)
 *
 * The register arguments are stringified with #, so they are pasted as
 * written (e.g. %%mm0).  The list layout is the same 16-byte entry format
 * used by the chroma pass: pointer at +0, coefficients at +8, null pointer
 * terminates.  Samples are read at pointer + 2*REG_a; REG_a is expected to
 * be set up by the code that precedes this macro.  Uses REG_d and REG_S and
 * defines local label 2.
 */
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219     "lea                "offset"(%0), %%"REG_d"     \n\t"\
220     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
221     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
222     "movq                    "#dst1", "#dst2"       \n\t"\
223     ASMALIGN(4)\
224     "2:                                             \n\t"\
225     "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
226     "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
227     "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
228     "add                         $16, %%"REG_d"            \n\t"\
229     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
230     "pmulhw                 "#coeff", "#src1"       \n\t"\
231     "pmulhw                 "#coeff", "#src2"       \n\t"\
232     "paddw                   "#src1", "#dst1"       \n\t"\
233     "paddw                   "#src2", "#dst2"       \n\t"\
234     "test                  %%"REG_S", %%"REG_S"     \n\t"\
235     " jnz                         2b                \n\t"\
236
/*
 * YSCALEYUV2PACKEDX: chroma pass followed by the luma pass over the list at
 * LUM_MMX_FILTER_OFFSET(%0).  With the registers chosen here the results are
 * mm3 = U, mm4 = V (from the chroma pass) and mm1 = Y1, mm7 = Y2 (luma
 * accumulators); mm0, mm2 and mm5 are scratch.  The asm statement opened by
 * the chroma pass is still open after this macro.
 */
237 #define YSCALEYUV2PACKEDX \
238     YSCALEYUV2PACKEDX_UV \
239     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
240
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists plus the closing ");" for
 * an asm statement opened by YSCALEYUV2PACKEDX_UV or
 * YSCALEYUV2PACKEDX_ACCURATE_UV.
 *
 * No outputs.  Inputs:
 *   %0      - &c->redDither (register), base for all *_OFFSET displacements
 *   %1..%3  - dummy (memory), placeholders that keep the later operand numbers fixed
 *   %4      - dest (register)
 *   %5      - dstW (memory)
 * Clobbers REG_a, REG_d and REG_S.  The variables c, dummy, dest and dstW
 * must be in scope where the macro is used.
 */
241 #define YSCALEYUV2PACKEDX_END                 \
242     :: "r" (&c->redDither),                   \
243         "m" (dummy), "m" (dummy), "m" (dummy),\
244         "r" (dest), "m" (dstW)                \
245     : "%"REG_a, "%"REG_d, "%"REG_S            \
246     );
247
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: opens an "__asm__ volatile(" statement and
 * emits the chroma half of the vertical filter with 32-bit accumulation.
 * The statement is left open for further instruction text and for
 * YSCALEYUV2PACKEDX_END.
 *
 * REG_a is zeroed once and serves as the index; label 1 marks the start of
 * one group of outputs (the branch back to it is not part of this macro).
 * Label 2 loops over APCK_SIZE-byte filter entries starting at
 * CHR_MMX_FILTER_OFFSET(%0): first source pointer at +0, second at
 * +APCK_PTR2, coefficients at +APCK_COEF; the next entry's first pointer is
 * read from +APCK_SIZE and a null value ends the list.
 *
 * U words of the two lines are interleaved and multiplied with pmaddwd into
 * mm4/mm5, V words (displacement VOF) into mm6/mm7.  Afterwards the sums are
 * shifted right by 16, packed to words with signed saturation and the value
 * at VROUNDER_OFFSET(%0) is added.  The results are then stored to memory at
 * U_TEMP(%0) and V_TEMP(%0), so this macro writes to the structure that %0
 * points into; YSCALEYUV2PACKEDX_ACCURATE_YA reloads them from there.
 */
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
249     __asm__ volatile(\
250     "xor %%"REG_a", %%"REG_a"                       \n\t"\
251     ASMALIGN(4)\
252     "nop                                            \n\t"\
253     "1:                                             \n\t"\
254     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
255     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
256     "pxor                      %%mm4, %%mm4         \n\t"\
257     "pxor                      %%mm5, %%mm5         \n\t"\
258     "pxor                      %%mm6, %%mm6         \n\t"\
259     "pxor                      %%mm7, %%mm7         \n\t"\
260     ASMALIGN(4)\
261     "2:                                             \n\t"\
262     "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
263     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
264     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
265     "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
266     "movq                      %%mm0, %%mm3         \n\t"\
267     "punpcklwd                 %%mm1, %%mm0         \n\t"\
268     "punpckhwd                 %%mm1, %%mm3         \n\t"\
269     "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
270     "pmaddwd                   %%mm1, %%mm0         \n\t"\
271     "pmaddwd                   %%mm1, %%mm3         \n\t"\
272     "paddd                     %%mm0, %%mm4         \n\t"\
273     "paddd                     %%mm3, %%mm5         \n\t"\
274     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
275     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
276     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
277     "test                  %%"REG_S", %%"REG_S"     \n\t"\
278     "movq                      %%mm2, %%mm0         \n\t"\
279     "punpcklwd                 %%mm3, %%mm2         \n\t"\
280     "punpckhwd                 %%mm3, %%mm0         \n\t"\
281     "pmaddwd                   %%mm1, %%mm2         \n\t"\
282     "pmaddwd                   %%mm1, %%mm0         \n\t"\
283     "paddd                     %%mm2, %%mm6         \n\t"\
284     "paddd                     %%mm0, %%mm7         \n\t"\
285     " jnz                         2b                \n\t"\
286     "psrad                       $16, %%mm4         \n\t"\
287     "psrad                       $16, %%mm5         \n\t"\
288     "psrad                       $16, %%mm6         \n\t"\
289     "psrad                       $16, %%mm7         \n\t"\
290     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
291     "packssdw                  %%mm5, %%mm4         \n\t"\
292     "packssdw                  %%mm7, %%mm6         \n\t"\
293     "paddw                     %%mm0, %%mm4         \n\t"\
294     "paddw                     %%mm0, %%mm6         \n\t"\
295     "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
296     "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
297
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): instruction text for the vertical
 * filter of two adjacent groups of four 16-bit samples with 32-bit
 * accumulation.
 *
 *   offset - string literal: displacement from %0 to the filter list, which
 *            uses APCK_SIZE-byte entries (pointers at +0 and +APCK_PTR2,
 *            coefficients at +APCK_COEF, next first pointer at +APCK_SIZE,
 *            null pointer terminates)
 *
 * All eight MMX registers are used.  mm1/mm5 accumulate the first group and
 * mm7/mm6 the second.  After the loop the sums are shifted right by 16,
 * packed to words with signed saturation and the value at VROUNDER_OFFSET(%0)
 * is added, leaving mm1 = Y1 and mm7 = Y2.  Finally mm3 and mm4 are loaded
 * from U_TEMP(%0) and V_TEMP(%0), the slots written by
 * YSCALEYUV2PACKEDX_ACCURATE_UV.  Samples are read at pointer + 2*REG_a;
 * REG_a is expected to be set up by the preceding code.  Uses REG_d and
 * REG_S and defines local label 2.
 */
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299     "lea                "offset"(%0), %%"REG_d"     \n\t"\
300     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
301     "pxor                      %%mm1, %%mm1         \n\t"\
302     "pxor                      %%mm5, %%mm5         \n\t"\
303     "pxor                      %%mm7, %%mm7         \n\t"\
304     "pxor                      %%mm6, %%mm6         \n\t"\
305     ASMALIGN(4)\
306     "2:                                             \n\t"\
307     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
308     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
309     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
310     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
311     "movq                      %%mm0, %%mm3         \n\t"\
312     "punpcklwd                 %%mm4, %%mm0         \n\t"\
313     "punpckhwd                 %%mm4, %%mm3         \n\t"\
314     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
315     "pmaddwd                   %%mm4, %%mm0         \n\t"\
316     "pmaddwd                   %%mm4, %%mm3         \n\t"\
317     "paddd                     %%mm0, %%mm1         \n\t"\
318     "paddd                     %%mm3, %%mm5         \n\t"\
319     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
320     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
321     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
322     "test                  %%"REG_S", %%"REG_S"     \n\t"\
323     "movq                      %%mm2, %%mm0         \n\t"\
324     "punpcklwd                 %%mm3, %%mm2         \n\t"\
325     "punpckhwd                 %%mm3, %%mm0         \n\t"\
326     "pmaddwd                   %%mm4, %%mm2         \n\t"\
327     "pmaddwd                   %%mm4, %%mm0         \n\t"\
328     "paddd                     %%mm2, %%mm7         \n\t"\
329     "paddd                     %%mm0, %%mm6         \n\t"\
330     " jnz                         2b                \n\t"\
331     "psrad                       $16, %%mm1         \n\t"\
332     "psrad                       $16, %%mm5         \n\t"\
333     "psrad                       $16, %%mm7         \n\t"\
334     "psrad                       $16, %%mm6         \n\t"\
335     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
336     "packssdw                  %%mm5, %%mm1         \n\t"\
337     "packssdw                  %%mm6, %%mm7         \n\t"\
338     "paddw                     %%mm0, %%mm1         \n\t"\
339     "paddw                     %%mm0, %%mm7         \n\t"\
340     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
341     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
342
/*
 * YSCALEYUV2PACKEDX_ACCURATE: accurate chroma pass followed by the accurate
 * luma pass over the list at LUM_MMX_FILTER_OFFSET(%0).  Leaves mm1 = Y1,
 * mm7 = Y2, mm3 = U and mm4 = V; the asm statement opened by the chroma pass
 * is still open afterwards.
 */
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344     YSCALEYUV2PACKEDX_ACCURATE_UV \
345     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346
/*
 * YSCALEYUV2RGBX: instruction text that turns filtered Y/U/V words into
 * packed 8-bit colour components.  All constants are read relative to
 * operand %0 (U_OFFSET, V_OFFSET, Y_OFFSET, the four chroma coefficients and
 * Y_COEFF).
 *
 * Input registers:  mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V (word vectors).
 * Output registers: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes,
 *                   packed with unsigned saturation.
 * mm0, mm3 and mm6 are overwritten as scratch.
 *
 * Each chroma-derived word is duplicated (punpcklwd/punpckhwd of a register
 * with itself) so that one chroma value is added to two neighbouring luma
 * values.
 */
347 #define YSCALEYUV2RGBX \
348     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
349     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
350     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
351     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
352     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
353     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
354 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
356     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
357     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
358     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
359     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
360     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
361 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362     "paddw           %%mm3, %%mm4       \n\t"\
363     "movq            %%mm2, %%mm0       \n\t"\
364     "movq            %%mm5, %%mm6       \n\t"\
365     "movq            %%mm4, %%mm3       \n\t"\
366     "punpcklwd       %%mm2, %%mm2       \n\t"\
367     "punpcklwd       %%mm5, %%mm5       \n\t"\
368     "punpcklwd       %%mm4, %%mm4       \n\t"\
369     "paddw           %%mm1, %%mm2       \n\t"\
370     "paddw           %%mm1, %%mm5       \n\t"\
371     "paddw           %%mm1, %%mm4       \n\t"\
372     "punpckhwd       %%mm0, %%mm0       \n\t"\
373     "punpckhwd       %%mm6, %%mm6       \n\t"\
374     "punpckhwd       %%mm3, %%mm3       \n\t"\
375     "paddw           %%mm7, %%mm0       \n\t"\
376     "paddw           %%mm7, %%mm6       \n\t"\
377     "paddw           %%mm7, %%mm3       \n\t"\
378     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: summed with Y1, 2: with Y2) */\
379     "packuswb        %%mm0, %%mm2       \n\t"\
380     "packuswb        %%mm6, %%mm5       \n\t"\
381     "packuswb        %%mm3, %%mm4       \n\t"\
382
/*
 * REAL_YSCALEYUV2PACKED(index, c): instruction text that blends two source
 * lines (linear interpolation) for packed output.
 *
 *   index - register used as the loop index; zeroed here
 *   c     - base register for the *_MMX_FILTER_OFFSET displacements
 * Both are stringified with #; use the YSCALEYUV2PACKED wrapper so that the
 * arguments are macro-expanded before stringification.
 *
 * Operands expected from the enclosing asm statement, following the names
 * used in the in-line comments: %0 = buf0, %1 = buf1, %2 = uvbuf0,
 * %3 = uvbuf1.
 *
 * Before the loop the coefficient words at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back, i.e.
 * the macro modifies that memory in place.
 * NOTE(review): expanding the macro again without re-initialising those
 * words would shift them a second time -- confirm callers set them up anew.
 *
 * Loop body from label 1 (the branch back is not part of this macro):
 * result = ((line0 - line1) * coef >> 16) + (line1 >> 7), computed for
 * chroma into mm3 (U) and mm4 (V) and for luma into mm1 (Y1) and mm7 (Y2).
 */
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
385     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
386     "psraw                $3, %%mm0                           \n\t"\
387     "psraw                $3, %%mm1                           \n\t"\
388     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390     "xor            "#index", "#index"                        \n\t"\
391     ASMALIGN(4)\
392     "1:                                 \n\t"\
393     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
394     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
395     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
396     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
397     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
400     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402     "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
403     "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
404     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
405     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
407     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
408     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
409     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
410     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
411     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
412     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414     "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
415     "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
416     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
418
419 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
420
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): chroma half of the two-line blend used
 * for RGB output.
 *
 *   index - register used as the loop index; zeroed here, label 1 follows
 *   c     - base register for the coefficient/offset displacements
 * Operands %2 and %3 are the two chroma lines (uvbuf0/uvbuf1 in the in-line
 * comments); V is read at displacement VOF.
 *
 * Computes ((line0 - line1) * coef >> 16) + (line1 >> 4) with the
 * coefficient at CHR_MMX_FILTER_OFFSET+8(c), subtracts U_OFFSET/V_OFFSET and
 * starts the colour conversion.  On exit:
 *   mm2 = U - offset, mm5 = V - offset,
 *   mm3 = (U - offset) * UG_COEFF >> 16, mm4 = (V - offset) * VG_COEFF >> 16.
 * The branch back to label 1 is not part of this macro.
 */
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422     "xor            "#index", "#index"  \n\t"\
423     ASMALIGN(4)\
424     "1:                                 \n\t"\
425     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
428     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
429     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
432     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434     "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
435     "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
436     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
437     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
438     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
439     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
440     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
441     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
442     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
443     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
444     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
445
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): blends two lines of 16-bit
 * samples read from base registers b1 and b2 at byte offsets 2*index and
 * 2*index + 8.
 *
 * Computes ((b1 - b2) * coef >> 16) + (b2 >> 4) with the coefficient at
 * LUM_MMX_FILTER_OFFSET+8(c).  Results: mm1 (first four words) and mm7
 * (next four words); mm0 and mm6 are scratch.  All arguments are
 * stringified with #.
 */
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
448     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
449     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
450     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
451     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
452     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
453     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455     "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
456     "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
457     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): final stage of the colour conversion, with
 * all constants read relative to base register c.
 *
 * Input registers:  mm1 = Y1, mm7 = Y2, mm2 = U - offset, mm5 = V - offset,
 *                   mm3 = ug, mm4 = vg (green contributions of U and V).
 * Output registers: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes,
 *                   packed with unsigned saturation.
 * mm0, mm3 and mm6 are overwritten as scratch.
 */
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
462     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
463     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
464     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
465     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
466     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
467     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468     "paddw             %%mm3, %%mm4     \n\t"\
469     "movq              %%mm2, %%mm0     \n\t"\
470     "movq              %%mm5, %%mm6     \n\t"\
471     "movq              %%mm4, %%mm3     \n\t"\
472     "punpcklwd         %%mm2, %%mm2     \n\t"\
473     "punpcklwd         %%mm5, %%mm5     \n\t"\
474     "punpcklwd         %%mm4, %%mm4     \n\t"\
475     "paddw             %%mm1, %%mm2     \n\t"\
476     "paddw             %%mm1, %%mm5     \n\t"\
477     "paddw             %%mm1, %%mm4     \n\t"\
478     "punpckhwd         %%mm0, %%mm0     \n\t"\
479     "punpckhwd         %%mm6, %%mm6     \n\t"\
480     "punpckhwd         %%mm3, %%mm3     \n\t"\
481     "paddw             %%mm7, %%mm0     \n\t"\
482     "paddw             %%mm7, %%mm6     \n\t"\
483     "paddw             %%mm7, %%mm3     \n\t"\
484     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: summed with Y1, 2: with Y2) */\
485     "packuswb          %%mm0, %%mm2     \n\t"\
486     "packuswb          %%mm6, %%mm5     \n\t"\
487     "packuswb          %%mm3, %%mm4     \n\t"\
488
/*
 * YSCALEYUV2RGB_YA: wrapper that lets its arguments be macro-expanded before
 * REAL_YSCALEYUV2RGB_YA stringifies them.
 */
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
490
/*
 * YSCALEYUV2RGB(index, c): two-line blend plus colour conversion, built from
 * the chroma stage, the luma stage reading operands %0 and %1, and the
 * coefficient stage.  Because the REAL_* macros are reached through this
 * wrapper, index and c are macro-expanded before they are stringified.
 * Results are left in mm2 (B), mm4 (G) and mm5 (R).
 */
491 #define YSCALEYUV2RGB(index, c) \
492     REAL_YSCALEYUV2RGB_UV(index, c) \
493     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494     REAL_YSCALEYUV2RGB_COEFF(c)
495
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line variant for packed output;
 * no blending between lines takes place.
 *
 *   index - register used as the loop index; zeroed here, label 1 follows
 *   c     - accepted for interface symmetry; not referenced in the body
 * Reads chroma from operand %2 (V at displacement VOF) and luma from operand
 * %0, and shifts every word right by 7 (arithmetic).
 * Results: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 * The branch back to label 1 is not part of this macro.  Use the
 * YSCALEYUV2PACKED1 wrapper so the arguments are macro-expanded before they
 * are stringified.
 */
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497     "xor            "#index", "#index"  \n\t"\
498     ASMALIGN(4)\
499     "1:                                 \n\t"\
500     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
501     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
502     "psraw                $7, %%mm3     \n\t" \
503     "psraw                $7, %%mm4     \n\t" \
504     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
505     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
506     "psraw                $7, %%mm1     \n\t" \
507     "psraw                $7, %%mm7     \n\t" \
508
509 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
510
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-line variant of the RGB conversion;
 * no blending between lines takes place.
 *
 *   index - register used as the loop index; zeroed here, label 1 follows
 *   c     - base register for the offset/coefficient displacements
 * Reads chroma from operand %2 (V at displacement VOF) and luma from operand
 * %0, shifts every word right by 4 (arithmetic) and then performs the same
 * conversion steps as REAL_YSCALEYUV2RGB_COEFF.
 * Results: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, packed with
 * unsigned saturation; mm0, mm1, mm3, mm6 and mm7 are overwritten.
 * The branch back to label 1 is not part of this macro.  Use the
 * YSCALEYUV2RGB1 wrapper so the arguments are macro-expanded before they are
 * stringified.
 */
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512     "xor            "#index", "#index"  \n\t"\
513     ASMALIGN(4)\
514     "1:                                 \n\t"\
515     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
516     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
517     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
518     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
519     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
520     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
521     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
522     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
523     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
524     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
525     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
527     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
528     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
529     "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
530     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
531     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
532     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
533     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
534     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
535     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
536     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537     "paddw             %%mm3, %%mm4     \n\t"\
538     "movq              %%mm2, %%mm0     \n\t"\
539     "movq              %%mm5, %%mm6     \n\t"\
540     "movq              %%mm4, %%mm3     \n\t"\
541     "punpcklwd         %%mm2, %%mm2     \n\t"\
542     "punpcklwd         %%mm5, %%mm5     \n\t"\
543     "punpcklwd         %%mm4, %%mm4     \n\t"\
544     "paddw             %%mm1, %%mm2     \n\t"\
545     "paddw             %%mm1, %%mm5     \n\t"\
546     "paddw             %%mm1, %%mm4     \n\t"\
547     "punpckhwd         %%mm0, %%mm0     \n\t"\
548     "punpckhwd         %%mm6, %%mm6     \n\t"\
549     "punpckhwd         %%mm3, %%mm3     \n\t"\
550     "paddw             %%mm7, %%mm0     \n\t"\
551     "paddw             %%mm7, %%mm6     \n\t"\
552     "paddw             %%mm7, %%mm3     \n\t"\
553     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: summed with Y1, 2: with Y2) */\
554     "packuswb          %%mm0, %%mm2     \n\t"\
555     "packuswb          %%mm6, %%mm5     \n\t"\
556     "packuswb          %%mm3, %%mm4     \n\t"\
557
558 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
559
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): packed-output variant that takes luma
 * from a single line but combines the chroma of two lines.
 *
 *   index - register used as the loop index; zeroed here, label 1 follows
 *   c     - accepted for interface symmetry; not referenced in the body
 * Chroma: words from operands %2 and %3 (V at displacement VOF) are added
 * and the sum is shifted right by 8 with a logical shift (psrlw).
 * Luma: words from operand %0 are shifted right by 7 (arithmetic).
 * Results: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2; mm2 and mm5 are scratch.
 * The branch back to label 1 is not part of this macro.  Use the
 * YSCALEYUV2PACKED1b wrapper so the arguments are macro-expanded before they
 * are stringified.
 */
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561     "xor "#index", "#index"             \n\t"\
562     ASMALIGN(4)\
563     "1:                                 \n\t"\
564     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
565     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
566     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
567     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
568     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570     "psrlw                $8, %%mm3     \n\t" \
571     "psrlw                $8, %%mm4     \n\t" \
572     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
573     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
574     "psraw                $7, %%mm1     \n\t" \
575     "psraw                $7, %%mm7     \n\t"
576 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
577
578 // do vertical chrominance interpolation
/*
 * Loop head for YUV -> RGB conversion from a single luma line with chroma
 * summed from two lines.
 *
 * Zeroes `index`, opens asm label "1:" and converts 8 pixels per iteration.
 * Asm operands used: %0 = luma line, %2 and %3 = the two chroma lines (V at a
 * displacement of VOF bytes from U). `c` is the base register for the
 * coefficient/offset fields (U_OFFSET, V_OFFSET, Y_OFFSET, UG/VG/UB/VR/Y_COEFF).
 *
 * Each chroma word is duplicated (punpcklwd/punpckhwd with itself) so that
 * one chroma sample is applied to two neighbouring luma samples.
 *
 * Register state after the expansion (8 unsigned-saturated bytes each):
 *   mm2 = B, mm4 = G, mm5 = R
 * mm0, mm1, mm3, mm6 and mm7 are used as temporaries. The loop opened here is
 * closed by the "jb 1b" at the end of the WRITE* macros.
 */
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580     "xor            "#index", "#index"  \n\t"\
581     ASMALIGN(4)\
582     "1:                                 \n\t"\
583     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]      (U) */\
584     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]      (U) */\
585     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+VOF]  (V) */\
586     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+VOF]  (V) */\
587     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index] */\
588     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index+VOF] + uvbuf1[index+VOF] */\
589     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
590     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
591     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
592     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
593     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
594     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
595     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
596     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
597     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index]   (4 luma words) */\
599     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index+4] (next 4 luma words) */\
600     "psraw                $4, %%mm1     \n\t" /* buf0[index]   >>4 */\
601     "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4 */\
602     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
603     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
604     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
605     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
606     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
607     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
608     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609     "paddw             %%mm3, %%mm4     \n\t"\
610     "movq              %%mm2, %%mm0     \n\t"\
611     "movq              %%mm5, %%mm6     \n\t"\
612     "movq              %%mm4, %%mm3     \n\t"\
613     "punpcklwd         %%mm2, %%mm2     \n\t"\
614     "punpcklwd         %%mm5, %%mm5     \n\t"\
615     "punpcklwd         %%mm4, %%mm4     \n\t"\
616     "paddw             %%mm1, %%mm2     \n\t"\
617     "paddw             %%mm1, %%mm5     \n\t"\
618     "paddw             %%mm1, %%mm4     \n\t"\
619     "punpckhwd         %%mm0, %%mm0     \n\t"\
620     "punpckhwd         %%mm6, %%mm6     \n\t"\
621     "punpckhwd         %%mm3, %%mm3     \n\t"\
622     "paddw             %%mm7, %%mm0     \n\t"\
623     "paddw             %%mm7, %%mm6     \n\t"\
624     "paddw             %%mm7, %%mm3     \n\t"\
625     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626     "packuswb          %%mm0, %%mm2     \n\t"\
627     "packuswb          %%mm6, %%mm5     \n\t"\
628     "packuswb          %%mm3, %%mm4     \n\t"\
629
/* Forwarding wrapper so that arguments are macro-expanded before being stringified. */
630 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
631
/*
 * Load 8 alpha samples for the single-line RGB path.
 *
 * Reads 8 words from asm operand %1 at index*2, shifts each right by 7
 * (arithmetic) and packs them with unsigned saturation.
 * Result: mm7 = 8 alpha bytes; mm1 is used as a temporary.
 */
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
634     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
635     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
636     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
637     "packuswb          %%mm1, %%mm7     \n\t"
/* Forwarding wrapper so that the argument is macro-expanded before being stringified. */
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639
/*
 * Interleave four byte-planar MMX registers into eight 32-bit pixels, store
 * them and close the pixel loop.
 *
 *   b, g, r, a        registers holding 8 bytes each; every stored pixel is
 *                     the byte sequence b, g, r, a (lowest address first)
 *   q0, q2, q3, t     scratch MMX registers
 *   dst, dstw, index  destination base, width operand and pixel index register
 *
 * Stores 32 bytes at dst + index*4, adds 8 to index and jumps back to asm
 * label "1:" while index < dstw (unsigned compare). b, r and the scratch
 * registers are overwritten; g and a are only read.
 * The per-line comments list bytes from most to least significant.
 */
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641     "movq       "#b", "#q2"     \n\t" /* B */\
642     "movq       "#r", "#t"      \n\t" /* R */\
643     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
644     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
645     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
646     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
647     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
648     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
649     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
650     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
651     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
652     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
653 \
654     MOVNTQ(   q0,   (dst, index, 4))\
655     MOVNTQ(    b,  8(dst, index, 4))\
656     MOVNTQ(   q2, 16(dst, index, 4))\
657     MOVNTQ(   q3, 24(dst, index, 4))\
658 \
659     "add      $8, "#index"      \n\t"\
660     "cmp "#dstw", "#index"      \n\t"\
661     " jb      1b                \n\t"
/* Forwarding wrapper so that arguments are macro-expanded before being stringified. */
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
663
/*
 * Pack 8 pixels into 16-bit 5-6-5 words, store them and close the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0 (mm7 supplies
 * the zero high bytes when G is widened to words).
 * B and R are masked with bF8 and G with bFC (presumably byte masks keeping
 * the top 5 and top 6 bits; the constants are defined elsewhere). Each output
 * word is then built as
 *   (R << 8) | (G << 3) | (B >> 3)
 * from the masked values.
 *
 * Stores 16 bytes at dst + index*2, adds 8 to index and jumps back to asm
 * label "1:" while index < dstw (unsigned compare).
 * Overwrites mm1, mm2, mm3, mm4 and mm5.
 */
664 #define REAL_WRITERGB16(dst, dstw, index) \
665     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
666     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
667     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
668     "psrlq           $3, %%mm2  \n\t"\
669 \
670     "movq         %%mm2, %%mm1  \n\t"\
671     "movq         %%mm4, %%mm3  \n\t"\
672 \
673     "punpcklbw    %%mm7, %%mm3  \n\t"\
674     "punpcklbw    %%mm5, %%mm2  \n\t"\
675     "punpckhbw    %%mm7, %%mm4  \n\t"\
676     "punpckhbw    %%mm5, %%mm1  \n\t"\
677 \
678     "psllq           $3, %%mm3  \n\t"\
679     "psllq           $3, %%mm4  \n\t"\
680 \
681     "por          %%mm3, %%mm2  \n\t"\
682     "por          %%mm4, %%mm1  \n\t"\
683 \
684     MOVNTQ(%%mm2,  (dst, index, 2))\
685     MOVNTQ(%%mm1, 8(dst, index, 2))\
686 \
687     "add             $8, "#index"   \n\t"\
688     "cmp        "#dstw", "#index"   \n\t"\
689     " jb             1b             \n\t"
/* Forwarding wrapper so that arguments are macro-expanded before being stringified. */
690 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
691
/*
 * Pack 8 pixels into 16-bit 5-5-5 words, store them and close the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0 (mm7 supplies
 * the zero high bytes when G is widened to words).
 * All three components are masked with bF8 (presumably a byte mask keeping
 * the top 5 bits; the constant is defined elsewhere). Each output word is
 * then built as
 *   ((R >> 1) << 8) | (G << 2) | (B >> 3)
 * from the masked values, which leaves the top bit of every word clear.
 *
 * Stores 16 bytes at dst + index*2, adds 8 to index and jumps back to asm
 * label "1:" while index < dstw (unsigned compare).
 * Overwrites mm1, mm2, mm3, mm4 and mm5.
 */
692 #define REAL_WRITERGB15(dst, dstw, index) \
693     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
694     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
695     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
696     "psrlq           $3, %%mm2  \n\t"\
697     "psrlq           $1, %%mm5  \n\t"\
698 \
699     "movq         %%mm2, %%mm1  \n\t"\
700     "movq         %%mm4, %%mm3  \n\t"\
701 \
702     "punpcklbw    %%mm7, %%mm3  \n\t"\
703     "punpcklbw    %%mm5, %%mm2  \n\t"\
704     "punpckhbw    %%mm7, %%mm4  \n\t"\
705     "punpckhbw    %%mm5, %%mm1  \n\t"\
706 \
707     "psllq           $2, %%mm3  \n\t"\
708     "psllq           $2, %%mm4  \n\t"\
709 \
710     "por          %%mm3, %%mm2  \n\t"\
711     "por          %%mm4, %%mm1  \n\t"\
712 \
713     MOVNTQ(%%mm2,  (dst, index, 2))\
714     MOVNTQ(%%mm1, 8(dst, index, 2))\
715 \
716     "add             $8, "#index"   \n\t"\
717     "cmp        "#dstw", "#index"   \n\t"\
718     " jb             1b             \n\t"
/* Forwarding wrapper so that arguments are macro-expanded before being stringified. */
719 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
720
/*
 * Mask-and-shift variant of the 24-bit writer. The WRITEBGR24 selection
 * further down in this file picks only WRITEBGR24MMX2 or WRITEBGR24MMX, so
 * this variant is not chosen there.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * The three components are first expanded to four quadwords of 0RGB pixels
 * and then squeezed into three quadwords (24 bytes, 8 pixels) using the
 * bm00000111 / bm11111000 / bm00001111 masks.
 *
 * Stores 24 bytes at dst, advances the dst register by 24, adds 8 to index
 * and jumps back to asm label "1:" while index < dstw (unsigned compare).
 * Overwrites mm0..mm6. The per-line comments list bytes from most to least
 * significant.
 */
721 #define WRITEBGR24OLD(dst, dstw, index) \
722     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723     "movq      %%mm2, %%mm1             \n\t" /* B */\
724     "movq      %%mm5, %%mm6             \n\t" /* R */\
725     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
726     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
727     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
728     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
729     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
730     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
731     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
732     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
733     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
734     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
735 \
736     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
737     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
738     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
739     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
740     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
741     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
742     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
743     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
744 \
745     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
746     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
747     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
748     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
749     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
750     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
751     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
752     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
753     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
754     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
755     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
756     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
757     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
758 \
759     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
760     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
761     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
762     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
763     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
764     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
765     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
766     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
767 \
768     MOVNTQ(%%mm0,   (dst))\
769     MOVNTQ(%%mm2,  8(dst))\
770     MOVNTQ(%%mm3, 16(dst))\
771     "add         $24, "#dst"            \n\t"\
772 \
773     "add          $8, "#index"          \n\t"\
774     "cmp     "#dstw", "#index"          \n\t"\
775     " jb          1b                    \n\t"
776
/*
 * 24-bit writer using plain MMX instructions; selected as WRITEBGR24 when
 * COMPILE_TEMPLATE_MMX2 is false.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * The components are expanded to four quadwords of 0RGB pixels, each
 * quadword is compacted to 0RGBRGB0 with a shift plus punpckhdq, and the
 * four results are merged into three output quadwords (24 bytes, 8 pixels).
 *
 * Stores 24 bytes at dst, advances the dst register by 24, adds 8 to index
 * and jumps back to asm label "1:" while index < dstw (unsigned compare).
 * Overwrites mm0..mm7. The per-line comments list bytes from most to least
 * significant.
 */
777 #define WRITEBGR24MMX(dst, dstw, index) \
778     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779     "movq      %%mm2, %%mm1     \n\t" /* B */\
780     "movq      %%mm5, %%mm6     \n\t" /* R */\
781     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
782     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
783     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
784     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
785     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
786     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
787     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
788     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
789     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
790     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
791 \
792     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
793     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
794     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
795     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
796 \
797     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
798     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
799     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
800     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
801 \
802     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
803     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
804     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
805     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
806 \
807     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
808     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
809     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
810     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
811     MOVNTQ(%%mm0, (dst))\
812 \
813     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
814     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
815     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
816     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
817     MOVNTQ(%%mm6, 8(dst))\
818 \
819     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
820     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
821     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
822     MOVNTQ(%%mm5, 16(dst))\
823 \
824     "add         $24, "#dst"    \n\t"\
825 \
826     "add          $8, "#index"  \n\t"\
827     "cmp     "#dstw", "#index"  \n\t"\
828     " jb          1b            \n\t"
829
/*
 * 24-bit writer built on pshufw; selected as WRITEBGR24 when
 * COMPILE_TEMPLATE_MMX2 is true.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each). For each of the three
 * output quadwords the needed bytes of every component are replicated with
 * pshufw, isolated with the ff_M24A / ff_M24B / ff_M24C masks (defined
 * elsewhere) and OR-ed together.
 *
 * Stores 24 bytes at dst, advances the dst register by 24, adds 8 to index
 * and jumps back to asm label "1:" while index < dstw (unsigned compare).
 * Overwrites mm0, mm1, mm3, mm4 (shifted right by one byte), mm6 and mm7;
 * mm2 and mm5 are only read. The per-line comments list bytes from most to
 * least significant.
 */
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831     /* mm2=B, %%mm4=G, %%mm5=R; %%mm0 and %%mm7 are loaded with masks below */\
832     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
835     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
836     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
837 \
838     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
839     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
840     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
841 \
842     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
843     "por    %%mm1, %%mm6        \n\t"\
844     "por    %%mm3, %%mm6        \n\t"\
845     MOVNTQ(%%mm6, (dst))\
846 \
847     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
848     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
849     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
850     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
851 \
852     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
853     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
854     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
855 \
856     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
857     "por    %%mm3, %%mm6        \n\t"\
858     MOVNTQ(%%mm6, 8(dst))\
859 \
860     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
861     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
862     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
863 \
864     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
865     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
866     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
867 \
868     "por    %%mm1, %%mm3        \n\t"\
869     "por    %%mm3, %%mm6        \n\t"\
870     MOVNTQ(%%mm6, 16(dst))\
871 \
872     "add      $24, "#dst"       \n\t"\
873 \
874     "add       $8, "#index"     \n\t"\
875     "cmp  "#dstw", "#index"     \n\t"\
876     " jb       1b               \n\t"
877
/*
 * Select the 24-bit writer for this instantiation of the template: the
 * pshufw-based variant when MMX2 is compiled in, the plain MMX one otherwise.
 * Any definition left over from an earlier inclusion is dropped first.
 */
#undef WRITEBGR24
#if COMPILE_TEMPLATE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
885
/*
 * Pack 8 pixels as interleaved 4:2:2 bytes, store them and close the loop.
 *
 * Expects 16-bit words in mm1 (luma 0..3), mm7 (luma 4..7), mm3 and mm4
 * (four chroma words each). All are narrowed to bytes with unsigned
 * saturation; the bytes from mm3 and mm4 are interleaved with each other and
 * then with the luma bytes, so the stored byte sequence is
 *   Y0 c3_0 Y1 c4_0  Y2 c3_1 Y3 c4_1 ...
 * where c3_k / c4_k are the bytes that came from mm3 / mm4. For YUYV output
 * that makes mm3 the U register and mm4 the V register.
 *
 * Stores 16 bytes at dst + index*2, adds 8 to index and jumps back to asm
 * label "1:" while index < dstw (unsigned compare).
 * Overwrites mm1, mm3, mm4 and mm7.
 */
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887     "packuswb  %%mm3, %%mm3     \n\t"\
888     "packuswb  %%mm4, %%mm4     \n\t"\
889     "packuswb  %%mm7, %%mm1     \n\t"\
890     "punpcklbw %%mm4, %%mm3     \n\t"\
891     "movq      %%mm1, %%mm7     \n\t"\
892     "punpcklbw %%mm3, %%mm1     \n\t"\
893     "punpckhbw %%mm3, %%mm7     \n\t"\
894 \
895     MOVNTQ(%%mm1, (dst, index, 2))\
896     MOVNTQ(%%mm7, 8(dst, index, 2))\
897 \
898     "add          $8, "#index"  \n\t"\
899     "cmp     "#dstw", "#index"  \n\t"\
900     " jb          1b            \n\t"
/* Forwarding wrapper so that arguments are macro-expanded before being stringified. */
901 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
902
903
/**
 * Vertical scaling stage writing planar output (luma, optional chroma and
 * optional alpha).
 *
 * With COMPILE_TEMPLATE_MMX and SWS_BITEXACT clear, every plane is produced
 * by a YSCALEYUV2YV12X macro (the _ACCURATE form when SWS_ACCURATE_RND is
 * set) and the function returns. Otherwise the work is delegated to
 * yuv2yuvX_altivec_real() when AltiVec is compiled in, or to yuv2yuvXinC().
 *
 * Chroma is written only when uDest is non-NULL (vDest is then used without
 * a separate check); alpha only when CONFIG_SWSCALE_ALPHA is set and aDest
 * is non-NULL.
 *
 * NOTE(review): the YSCALEYUV2YV12X* macros are defined outside this view;
 * they take no filter/source arguments here, so they presumably pick those
 * up through the context `c` - confirm against their definition.
 */
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
907 {
908 #if COMPILE_TEMPLATE_MMX
909     if(!(c->flags & SWS_BITEXACT)){
910         if (c->flags & SWS_ACCURATE_RND){
911             if (uDest){
                /* U and V use the same filter offset; V is selected by passing VOF as the first argument */
912                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913                 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
914             }
915             if (CONFIG_SWSCALE_ALPHA && aDest){
916                 YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
917             }
918
919             YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
920         }else{
921             if (uDest){
922                 YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923                 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
924             }
925             if (CONFIG_SWSCALE_ALPHA && aDest){
926                 YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
927             }
928
929             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
930         }
931         return;
932     }
933 #endif
934 #if COMPILE_TEMPLATE_ALTIVEC
/* the AltiVec routine is not given alpSrc/aDest, so no alpha plane is written on this path */
935 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936                       chrFilter, chrSrc, chrFilterSize,
937                       dest, uDest, vDest, dstW, chrDstW);
938 #else //COMPILE_TEMPLATE_ALTIVEC
939 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940             chrFilter, chrSrc, chrFilterSize,
941             alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!COMPILE_TEMPLATE_ALTIVEC
943 }
944
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946                                      const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
948 {
949 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950              chrFilter, chrSrc, chrFilterSize,
951              dest, uDest, dstW, chrDstW, dstFormat);
952 }
953
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
956 {
957     int i;
958 #if COMPILE_TEMPLATE_MMX
959     if(!(c->flags & SWS_BITEXACT)){
960         long p= 4;
961         uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963         x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
964
965         if (c->flags & SWS_ACCURATE_RND){
966             while(p--){
967                 if (dst[p]){
968                     __asm__ volatile(
969                         YSCALEYUV2YV121_ACCURATE
970                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
971                         "g" (-counter[p])
972                         : "%"REG_a
973                     );
974                 }
975             }
976         }else{
977             while(p--){
978                 if (dst[p]){
979                     __asm__ volatile(
980                         YSCALEYUV2YV121
981                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
982                         "g" (-counter[p])
983                         : "%"REG_a
984                     );
985                 }
986             }
987         }
988         return;
989     }
990 #endif
991     for (i=0; i<dstW; i++)
992     {
993         int val= (lumSrc[i]+64)>>7;
994
995         if (val&256){
996             if (val<0) val=0;
997             else       val=255;
998         }
999
1000         dest[i]= val;
1001     }
1002
1003     if (uDest)
1004         for (i=0; i<chrDstW; i++)
1005         {
1006             int u=(chrSrc[i       ]+64)>>7;
1007             int v=(chrSrc[i + VOFW]+64)>>7;
1008
1009             if ((u|v)&256){
1010                 if (u<0)        u=0;
1011                 else if (u>255) u=255;
1012                 if (v<0)        v=0;
1013                 else if (v>255) v=255;
1014             }
1015
1016             uDest[i]= u;
1017             vDest[i]= v;
1018         }
1019
1020     if (CONFIG_SWSCALE_ALPHA && aDest)
1021         for (i=0; i<dstW; i++){
1022             int val= (alpSrc[i]+64)>>7;
1023             aDest[i]= av_clip_uint8(val);
1024         }
1025 }
1026
1027
1028 /**
 * Vertical scaling stage (multi-tap filter) producing one packed output line.
 *
 * With COMPILE_TEMPLATE_MMX and SWS_BITEXACT clear, the destination formats
 * PIX_FMT_RGB32, BGR24, RGB555, RGB565 and YUYV422 are handled by inline asm
 * and the function returns from inside the switch; the _ACCURATE macro forms
 * are used when SWS_ACCURATE_RND is set. Every other case falls through to
 * ff_yuv2packedX_altivec() (AltiVec builds, listed formats, no alpha buffer,
 * SWS_BITEXACT clear) or to yuv2packedXinC().
 *
 * Each asm case is a single statement: it is evidently opened inside the
 * YSCALEYUV2PACKEDX* macro and closed either by YSCALEYUV2PACKEDX_END or, in
 * the BGR24 cases, by the explicit operand list. In that list %0 is
 * &c->redDither, %4 is dest and %5 is dstW; the other cases presumably use
 * the same operands via YSCALEYUV2PACKEDX_END (defined outside this view).
 */
1031 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1034 {
1035 #if COMPILE_TEMPLATE_MMX
1036     x86_reg dummy=0;
1037     if(!(c->flags & SWS_BITEXACT)){
1038         if (c->flags & SWS_ACCURATE_RND){
1039             switch(c->dstFormat){
1040             case PIX_FMT_RGB32:
1041                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1042                     YSCALEYUV2PACKEDX_ACCURATE
1043                     YSCALEYUV2RGBX
                    /* Spill the colour registers while the alpha filter runs. Only mm5 is
                     * reloaded explicitly; WRITEBGR32 below takes B from mm3 and G from mm4,
                     * so the _YA macro presumably reloads those - TODO confirm. */
1044                     "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1045                     "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1046                     "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1047                     YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048                     "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1049                     "psraw                        $3, %%mm1         \n\t"
1050                     "psraw                        $3, %%mm7         \n\t"
1051                     "packuswb                  %%mm7, %%mm1         \n\t"
1052                     WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1053
1054                     YSCALEYUV2PACKEDX_END
1055                 }else{
1056                     YSCALEYUV2PACKEDX_ACCURATE
1057                     YSCALEYUV2RGBX
1058                     "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones: alpha bytes 0xFF */
1059                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1060
1061                     YSCALEYUV2PACKEDX_END
1062                 }
1063                 return;
1064             case PIX_FMT_BGR24:
1065                 YSCALEYUV2PACKEDX_ACCURATE
1066                 YSCALEYUV2RGBX
1067                 "pxor %%mm7, %%mm7 \n\t"
                /* REGc = dest + 3*REGa: byte address of this iteration's first pixel */
1068                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069                 "add %4, %%"REG_c"                        \n\t"
1070                 WRITEBGR24(%%REGc, %5, %%REGa)
1071
1072
1073                 :: "r" (&c->redDither),
1074                 "m" (dummy), "m" (dummy), "m" (dummy),
1075                 "r" (dest), "m" (dstW)
1076                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077                 );
1078                 return;
1079             case PIX_FMT_RGB555:
1080                 YSCALEYUV2PACKEDX_ACCURATE
1081                 YSCALEYUV2RGBX
1082                 "pxor %%mm7, %%mm7 \n\t"
1083                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084 #ifdef DITHER1XBPP
1085                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088 #endif
1089
1090                 WRITERGB15(%4, %5, %%REGa)
1091                 YSCALEYUV2PACKEDX_END
1092                 return;
1093             case PIX_FMT_RGB565:
1094                 YSCALEYUV2PACKEDX_ACCURATE
1095                 YSCALEYUV2RGBX
1096                 "pxor %%mm7, %%mm7 \n\t"
1097                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1098 #ifdef DITHER1XBPP
1099                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102 #endif
1103
1104                 WRITERGB16(%4, %5, %%REGa)
1105                 YSCALEYUV2PACKEDX_END
1106                 return;
1107             case PIX_FMT_YUYV422:
1108                 YSCALEYUV2PACKEDX_ACCURATE
1109                 /* WRITEYUY2 consumes 16-bit words: mm1/mm7 = Y, mm3 = U, mm4 = V */
1110
1111                 "psraw $3, %%mm3    \n\t"
1112                 "psraw $3, %%mm4    \n\t"
1113                 "psraw $3, %%mm1    \n\t"
1114                 "psraw $3, %%mm7    \n\t"
1115                 WRITEYUY2(%4, %5, %%REGa)
1116                 YSCALEYUV2PACKEDX_END
1117                 return;
1118             }
1119         }else{
1120             switch(c->dstFormat)
1121             {
1122             case PIX_FMT_RGB32:
1123                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1124                     YSCALEYUV2PACKEDX
1125                     YSCALEYUV2RGBX
1126                     YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127                     "psraw                        $3, %%mm1         \n\t"
1128                     "psraw                        $3, %%mm7         \n\t"
1129                     "packuswb                  %%mm7, %%mm1         \n\t"
1130                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131                     YSCALEYUV2PACKEDX_END
1132                 }else{
1133                     YSCALEYUV2PACKEDX
1134                     YSCALEYUV2RGBX
1135                     "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones: alpha bytes 0xFF */
1136                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137                     YSCALEYUV2PACKEDX_END
1138                 }
1139                 return;
1140             case PIX_FMT_BGR24:
1141                 YSCALEYUV2PACKEDX
1142                 YSCALEYUV2RGBX
1143                 "pxor                    %%mm7, %%mm7       \n\t"
                /* REGc = dest + 3*REGa: byte address of this iteration's first pixel */
1144                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1145                 "add                        %4, %%"REG_c"   \n\t"
1146                 WRITEBGR24(%%REGc, %5, %%REGa)
1147
1148                 :: "r" (&c->redDither),
1149                 "m" (dummy), "m" (dummy), "m" (dummy),
1150                 "r" (dest),  "m" (dstW)
1151                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1152                 );
1153                 return;
1154             case PIX_FMT_RGB555:
1155                 YSCALEYUV2PACKEDX
1156                 YSCALEYUV2RGBX
1157                 "pxor %%mm7, %%mm7 \n\t"
1158                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1159 #ifdef DITHER1XBPP
1160                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1161                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1162                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1163 #endif
1164
1165                 WRITERGB15(%4, %5, %%REGa)
1166                 YSCALEYUV2PACKEDX_END
1167                 return;
1168             case PIX_FMT_RGB565:
1169                 YSCALEYUV2PACKEDX
1170                 YSCALEYUV2RGBX
1171                 "pxor %%mm7, %%mm7 \n\t"
1172                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1173 #ifdef DITHER1XBPP
1174                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1175                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1176                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1177 #endif
1178
1179                 WRITERGB16(%4, %5, %%REGa)
1180                 YSCALEYUV2PACKEDX_END
1181                 return;
1182             case PIX_FMT_YUYV422:
1183                 YSCALEYUV2PACKEDX
1184                 /* WRITEYUY2 consumes 16-bit words: mm1/mm7 = Y, mm3 = U, mm4 = V */
1185
1186                 "psraw $3, %%mm3    \n\t"
1187                 "psraw $3, %%mm4    \n\t"
1188                 "psraw $3, %%mm1    \n\t"
1189                 "psraw $3, %%mm7    \n\t"
1190                 WRITEYUY2(%4, %5, %%REGa)
1191                 YSCALEYUV2PACKEDX_END
1192                 return;
1193             }
1194         }
1195     }
1196 #endif /* COMPILE_TEMPLATE_MMX */
1197 #if COMPILE_TEMPLATE_ALTIVEC
1198     /* The following list of supported dstFormat values should
1199        match what's found in the body of ff_yuv2packedX_altivec() */
1200     if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1201        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1202         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1203         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1204             ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205                                    chrFilter, chrSrc, chrFilterSize,
1206                                    dest, dstW, dstY);
1207     else
1208 #endif
        /* generic C fallback; also reached from the MMX switches for formats they do not list */
1209         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210                        chrFilter, chrSrc, chrFilterSize,
1211                        alpSrc, dest, dstW, dstY);
1212 }
1213
1214 /**
1215  * vertical bilinear scale YV12 to RGB
1216  */
1217 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1219 {
1220     int  yalpha1=4095- yalpha;
1221     int uvalpha1=4095-uvalpha;
1222     int i;
1223
1224 #if COMPILE_TEMPLATE_MMX
1225     if(!(c->flags & SWS_BITEXACT)){
1226         switch(c->dstFormat)
1227         {
1228             //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1229             case PIX_FMT_RGB32:
1230                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1231 #if ARCH_X86_64
1232                     __asm__ volatile(
1233                     YSCALEYUV2RGB(%%REGBP, %5)
1234                     YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235                     "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236                     "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237                     "packuswb            %%mm7, %%mm1       \n\t"
1238                     WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1239
1240                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1241                     "a" (&c->redDither)
1242                     ,"r" (abuf0), "r" (abuf1)
1243                     : "%"REG_BP
1244                     );
1245 #else
1246                     *(uint16_t **)(&c->u_temp)=abuf0;
1247                     *(uint16_t **)(&c->v_temp)=abuf1;
1248                     __asm__ volatile(
1249                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1250                     "mov        %4, %%"REG_b"               \n\t"
1251                     "push %%"REG_BP"                        \n\t"
1252                     YSCALEYUV2RGB(%%REGBP, %5)
1253                     "push                   %0              \n\t"
1254                     "push                   %1              \n\t"
1255                     "mov          "U_TEMP"(%5), %0          \n\t"
1256                     "mov          "V_TEMP"(%5), %1          \n\t"
1257                     YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258                     "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259                     "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260                     "packuswb            %%mm7, %%mm1       \n\t"
1261                     "pop                    %1              \n\t"
1262                     "pop                    %0              \n\t"
1263                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264                     "pop %%"REG_BP"                         \n\t"
1265                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1266
1267                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1268                     "a" (&c->redDither)
1269                     );
1270 #endif
1271                 }else{
1272                     __asm__ volatile(
1273                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1274                     "mov        %4, %%"REG_b"               \n\t"
1275                     "push %%"REG_BP"                        \n\t"
1276                     YSCALEYUV2RGB(%%REGBP, %5)
1277                     "pcmpeqd %%mm7, %%mm7                   \n\t"
1278                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279                     "pop %%"REG_BP"                         \n\t"
1280                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1281
1282                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283                     "a" (&c->redDither)
1284                     );
1285                 }
1286                 return;
1287             case PIX_FMT_BGR24:
1288                 __asm__ volatile(
1289                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1290                 "mov        %4, %%"REG_b"               \n\t"
1291                 "push %%"REG_BP"                        \n\t"
1292                 YSCALEYUV2RGB(%%REGBP, %5)
1293                 "pxor    %%mm7, %%mm7                   \n\t"
1294                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295                 "pop %%"REG_BP"                         \n\t"
1296                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1297                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1298                 "a" (&c->redDither)
1299                 );
1300                 return;
1301             case PIX_FMT_RGB555:
1302                 __asm__ volatile(
1303                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1304                 "mov        %4, %%"REG_b"               \n\t"
1305                 "push %%"REG_BP"                        \n\t"
1306                 YSCALEYUV2RGB(%%REGBP, %5)
1307                 "pxor    %%mm7, %%mm7                   \n\t"
1308                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1309 #ifdef DITHER1XBPP
1310                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1311                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1312                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1313 #endif
1314
1315                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1316                 "pop %%"REG_BP"                         \n\t"
1317                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1318
1319                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1320                 "a" (&c->redDither)
1321                 );
1322                 return;
1323             case PIX_FMT_RGB565:
1324                 __asm__ volatile(
1325                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1326                 "mov        %4, %%"REG_b"               \n\t"
1327                 "push %%"REG_BP"                        \n\t"
1328                 YSCALEYUV2RGB(%%REGBP, %5)
1329                 "pxor    %%mm7, %%mm7                   \n\t"
1330                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1331 #ifdef DITHER1XBPP
1332                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1333                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1334                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1335 #endif
1336
1337                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1338                 "pop %%"REG_BP"                         \n\t"
1339                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1340                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1341                 "a" (&c->redDither)
1342                 );
1343                 return;
1344             case PIX_FMT_YUYV422:
1345                 __asm__ volatile(
1346                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1347                 "mov %4, %%"REG_b"                        \n\t"
1348                 "push %%"REG_BP"                        \n\t"
1349                 YSCALEYUV2PACKED(%%REGBP, %5)
1350                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351                 "pop %%"REG_BP"                         \n\t"
1352                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1353                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354                 "a" (&c->redDither)
1355                 );
1356                 return;
1357             default: break;
1358         }
1359     }
1360 #endif //COMPILE_TEMPLATE_MMX
1361 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1362 }
1363
1364 /**
 * YV12 to RGB without scaling or interpolating.
 *
 * Writes one line of packed output into dest from a single input line buf0
 * (buf1 is simply aliased to buf0 below).
 *
 * Control flow, as visible in this function:
 *  - If SWS_FULL_CHR_H_INT is set in flags, the call is forwarded to
 *    c->yuv2packed2() with buf0 and abuf0 each passed twice and 0 as the
 *    first weight argument.
 *  - If MMX is compiled in and SWS_BITEXACT is not set in flags, dstFormat
 *    values PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 *    PIX_FMT_YUYV422 are handled by inline assembly and the function returns.
 *    Any other dstFormat leaves the switch (there is no default case).
 *  - Everything that reaches the end is handled by YSCALE_YUV_2_ANYRGB_C.
 *
 * Both the assembly and the C path pick one of two macro families depending
 * on uvalpha: uvalpha < 2048 uses the "1" macros, otherwise the "1b"/"1B"
 * macros. NOTE(review): the macro bodies are not visible here; presumably the
 * first family reads chroma from one line only and the second combines
 * uvbuf0 and uvbuf1 -- confirm against the macro definitions.
 *
 * @param c         scaler context; this function reads c->alpPixBuf,
 *                  c->yuv2packed2 and takes the address of c->redDither
 * @param buf0      input line passed to the asm as operand %0 (presumably luma)
 * @param uvbuf0    first chroma input line (asm operand %2)
 * @param uvbuf1    second chroma input line (asm operand %3)
 * @param abuf0     alpha input line; only used in the PIX_FMT_RGB32 branches
 *                  when CONFIG_SWSCALE_ALPHA && c->alpPixBuf, and when
 *                  forwarding to c->yuv2packed2()
 * @param dest      output line (asm operand %4)
 * @param dstW      not referenced directly except when forwarding to
 *                  c->yuv2packed2(); presumably used by the C macros
 * @param uvalpha   selects the macro family (threshold 2048, see above)
 * @param dstFormat output pixel format selecting the switch case
 * @param flags     SWS_* flags; SWS_FULL_CHR_H_INT and SWS_BITEXACT are tested
 * @param y         only forwarded to c->yuv2packed2() in the visible code
 */
1367 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1369 {
         /* yalpha1, i, buf1 and yalpha are not referenced by name in the code
          * below; presumably they are consumed by the YSCALE_YUV_2_*_C macros. */
1370     const int yalpha1=0;
1371     int i;
1372
1373     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1374     const int yalpha= 4096; //FIXME ...
1375
         /* Full chroma interpolation requested: reuse the two-line routine,
          * feeding it the same line twice. */
1376     if (flags&SWS_FULL_CHR_H_INT)
1377     {
1378         c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1379         return;
1380     }
1381
         /*
          * Every asm statement below follows the same frame:
          *   - REG_b is stored into the ESP_OFFSET slot addressed relative to %5
          *     (&c->redDither) and reloaded from there at the end;
          *   - in between REG_b holds dest (%4);
          *   - REG_BP is preserved with push/pop around the macro-generated body.
          * An earlier note in this file states that the literal 8280 equals
          * DSTW_OFFSET, spelled as a number because of a preprocessor limitation.
          *
          * NOTE(review): none of these asm statements declares any clobber, in
          * particular no "memory" clobber, although the WRITE* macros presumably
          * store to dest -- confirm whether that is intentional.
          */
1382 #if COMPILE_TEMPLATE_MMX
1383     if(!(flags & SWS_BITEXACT)){
1384         if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1385         {
1386             switch(dstFormat)
1387             {
1388             case PIX_FMT_RGB32:
1389                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                         /* Alpha variant: abuf0 takes the place of buf1 as operand %1. */
1390                     __asm__ volatile(
1391                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1392                     "mov        %4, %%"REG_b"               \n\t"
1393                     "push %%"REG_BP"                        \n\t"
1394                     YSCALEYUV2RGB1(%%REGBP, %5)
1395                     YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397                     "pop %%"REG_BP"                         \n\t"
1398                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1399
1400                     :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401                     "a" (&c->redDither)
1402                     );
1403                 }else{
                         /* No alpha plane: pcmpeqd of mm7 with itself sets mm7 to all
                          * ones before it is handed to WRITEBGR32. */
1404                     __asm__ volatile(
1405                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1406                     "mov        %4, %%"REG_b"               \n\t"
1407                     "push %%"REG_BP"                        \n\t"
1408                     YSCALEYUV2RGB1(%%REGBP, %5)
1409                     "pcmpeqd %%mm7, %%mm7                   \n\t"
1410                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411                     "pop %%"REG_BP"                         \n\t"
1412                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1413
1414                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1415                     "a" (&c->redDither)
1416                     );
1417                 }
1418                 return;
1419             case PIX_FMT_BGR24:
                     /* mm7 is zeroed (pxor) before the 24-bit writer runs. */
1420                 __asm__ volatile(
1421                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1422                 "mov        %4, %%"REG_b"               \n\t"
1423                 "push %%"REG_BP"                        \n\t"
1424                 YSCALEYUV2RGB1(%%REGBP, %5)
1425                 "pxor    %%mm7, %%mm7                   \n\t"
1426                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427                 "pop %%"REG_BP"                         \n\t"
1428                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1429
1430                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431                 "a" (&c->redDither)
1432                 );
1433                 return;
1434             case PIX_FMT_RGB555:
                     /* With DITHER1XBPP defined, per-channel dither values stored
                      * relative to %5 are added with unsigned saturation first. */
1435                 __asm__ volatile(
1436                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1437                 "mov        %4, %%"REG_b"               \n\t"
1438                 "push %%"REG_BP"                        \n\t"
1439                 YSCALEYUV2RGB1(%%REGBP, %5)
1440                 "pxor    %%mm7, %%mm7                   \n\t"
1441                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1442 #ifdef DITHER1XBPP
1443                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1444                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1445                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1446 #endif
1447                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448                 "pop %%"REG_BP"                         \n\t"
1449                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1450
1451                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452                 "a" (&c->redDither)
1453                 );
1454                 return;
1455             case PIX_FMT_RGB565:
                     /* Same as the RGB555 case but using the 16-bit writer. */
1456                 __asm__ volatile(
1457                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1458                 "mov        %4, %%"REG_b"               \n\t"
1459                 "push %%"REG_BP"                        \n\t"
1460                 YSCALEYUV2RGB1(%%REGBP, %5)
1461                 "pxor    %%mm7, %%mm7                   \n\t"
1462                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1463 #ifdef DITHER1XBPP
1464                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1465                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1466                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1467 #endif
1468
1469                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470                 "pop %%"REG_BP"                         \n\t"
1471                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1472
1473                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474                 "a" (&c->redDither)
1475                 );
1476                 return;
1477             case PIX_FMT_YUYV422:
                     /* Packed YUV output: uses the PACKED1 macro instead of RGB1. */
1478                 __asm__ volatile(
1479                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1480                 "mov        %4, %%"REG_b"               \n\t"
1481                 "push %%"REG_BP"                        \n\t"
1482                 YSCALEYUV2PACKED1(%%REGBP, %5)
1483                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484                 "pop %%"REG_BP"                         \n\t"
1485                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1486
1487                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488                 "a" (&c->redDither)
1489                 );
1490                 return;
1491             }
1492         }
1493         else
1494         {
                 /* uvalpha >= 2048: identical case list, but built from the "1b"
                  * macro variants. */
1495             switch(dstFormat)
1496             {
1497             case PIX_FMT_RGB32:
1498                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                         /* Alpha variant: abuf0 takes the place of buf1 as operand %1. */
1499                     __asm__ volatile(
1500                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501                     "mov        %4, %%"REG_b"               \n\t"
1502                     "push %%"REG_BP"                        \n\t"
1503                     YSCALEYUV2RGB1b(%%REGBP, %5)
1504                     YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506                     "pop %%"REG_BP"                         \n\t"
1507                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1508
1509                     :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510                     "a" (&c->redDither)
1511                     );
1512                 }else{
                         /* No alpha plane: mm7 is set to all ones via pcmpeqd. */
1513                     __asm__ volatile(
1514                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1515                     "mov        %4, %%"REG_b"               \n\t"
1516                     "push %%"REG_BP"                        \n\t"
1517                     YSCALEYUV2RGB1b(%%REGBP, %5)
1518                     "pcmpeqd %%mm7, %%mm7                   \n\t"
1519                     WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520                     "pop %%"REG_BP"                         \n\t"
1521                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1522
1523                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524                     "a" (&c->redDither)
1525                     );
1526                 }
1527                 return;
1528             case PIX_FMT_BGR24:
1529                 __asm__ volatile(
1530                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1531                 "mov        %4, %%"REG_b"               \n\t"
1532                 "push %%"REG_BP"                        \n\t"
1533                 YSCALEYUV2RGB1b(%%REGBP, %5)
1534                 "pxor    %%mm7, %%mm7                   \n\t"
1535                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536                 "pop %%"REG_BP"                         \n\t"
1537                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1538
1539                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540                 "a" (&c->redDither)
1541                 );
1542                 return;
1543             case PIX_FMT_RGB555:
1544                 __asm__ volatile(
1545                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1546                 "mov        %4, %%"REG_b"               \n\t"
1547                 "push %%"REG_BP"                        \n\t"
1548                 YSCALEYUV2RGB1b(%%REGBP, %5)
1549                 "pxor    %%mm7, %%mm7                   \n\t"
1550                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1551 #ifdef DITHER1XBPP
1552                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1553                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1554                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1555 #endif
1556                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557                 "pop %%"REG_BP"                         \n\t"
1558                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1559
1560                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1561                 "a" (&c->redDither)
1562                 );
1563                 return;
1564             case PIX_FMT_RGB565:
1565                 __asm__ volatile(
1566                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1567                 "mov        %4, %%"REG_b"               \n\t"
1568                 "push %%"REG_BP"                        \n\t"
1569                 YSCALEYUV2RGB1b(%%REGBP, %5)
1570                 "pxor    %%mm7, %%mm7                   \n\t"
1571                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1572 #ifdef DITHER1XBPP
1573                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1574                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1575                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1576 #endif
1577
1578                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579                 "pop %%"REG_BP"                         \n\t"
1580                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1581
1582                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583                 "a" (&c->redDither)
1584                 );
1585                 return;
1586             case PIX_FMT_YUYV422:
1587                 __asm__ volatile(
1588                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1589                 "mov        %4, %%"REG_b"               \n\t"
1590                 "push %%"REG_BP"                        \n\t"
1591                 YSCALEYUV2PACKED1b(%%REGBP, %5)
1592                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593                 "pop %%"REG_BP"                         \n\t"
1594                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1595
1596                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1597                 "a" (&c->redDither)
1598                 );
1599                 return;
1600             }
1601         }
1602     }
1603 #endif /* COMPILE_TEMPLATE_MMX */
         /* Generic C path: reached without MMX, with SWS_BITEXACT set, or for a
          * dstFormat that none of the asm cases above handles. */
1604     if (uvalpha < 2048)
1605     {
1606         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1607     }else{
1608         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1609     }
1610 }
1611
1612 //FIXME yuy2* can read up to 7 samples too much
1613
1614 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1615 {
1616 #if COMPILE_TEMPLATE_MMX
1617     __asm__ volatile(
1618     "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1619     "mov                    %0, %%"REG_a"       \n\t"
1620     "1:                                         \n\t"
1621     "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1622     "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1623     "pand                %%mm2, %%mm0           \n\t"
1624     "pand                %%mm2, %%mm1           \n\t"
1625     "packuswb            %%mm1, %%mm0           \n\t"
1626     "movq                %%mm0, (%2, %%"REG_a") \n\t"
1627     "add                    $8, %%"REG_a"       \n\t"
1628     " js                    1b                  \n\t"
1629     : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1630     : "%"REG_a
1631     );
1632 #else
1633     int i;
1634     for (i=0; i<width; i++)
1635         dst[i]= src[2*i];
1636 #endif
1637 }
1638
1639 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1640 {
1641 #if COMPILE_TEMPLATE_MMX
1642     __asm__ volatile(
1643     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1644     "mov                    %0, %%"REG_a"       \n\t"
1645     "1:                                         \n\t"
1646     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1647     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1648     "psrlw                  $8, %%mm0           \n\t"
1649     "psrlw                  $8, %%mm1           \n\t"
1650     "packuswb            %%mm1, %%mm0           \n\t"
1651     "movq                %%mm0, %%mm1           \n\t"
1652     "psrlw                  $8, %%mm0           \n\t"
1653     "pand                %%mm4, %%mm1           \n\t"
1654     "packuswb            %%mm0, %%mm0           \n\t"
1655     "packuswb            %%mm1, %%mm1           \n\t"
1656     "movd                %%mm0, (%3, %%"REG_a") \n\t"
1657     "movd                %%mm1, (%2, %%"REG_a") \n\t"
1658     "add                    $4, %%"REG_a"       \n\t"
1659     " js                    1b                  \n\t"
1660     : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1661     : "%"REG_a
1662     );
1663 #else
1664     int i;
1665     for (i=0; i<width; i++)
1666     {
1667         dstU[i]= src1[4*i + 1];
1668         dstV[i]= src1[4*i + 3];
1669     }
1670 #endif
1671     assert(src1 == src2);
1672 }
1673
1674 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1675 {
1676 #if COMPILE_TEMPLATE_MMX
1677     __asm__ volatile(
1678     "mov                    %0, %%"REG_a"       \n\t"
1679     "1:                                         \n\t"
1680     "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1681     "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1682     "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1683     "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1684     "psrlw                  $8, %%mm0           \n\t"
1685     "psrlw                  $8, %%mm1           \n\t"
1686     "psrlw                  $8, %%mm2           \n\t"
1687     "psrlw                  $8, %%mm3           \n\t"
1688     "packuswb            %%mm1, %%mm0           \n\t"
1689     "packuswb            %%mm3, %%mm2           \n\t"
1690     "movq                %%mm0, (%3, %%"REG_a") \n\t"
1691     "movq                %%mm2, (%4, %%"REG_a") \n\t"
1692     "add                    $8, %%"REG_a"       \n\t"
1693     " js                    1b                  \n\t"
1694     : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1695     : "%"REG_a
1696     );
1697 #else
1698     int i;
1699     for (i=0; i<width; i++)
1700     {
1701         dstU[i]= src1[2*i + 1];
1702         dstV[i]= src2[2*i + 1];
1703     }
1704 #endif
1705 }
1706
1707 /* This is almost identical to the previous, end exists only because
1708  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1709 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1710 {
1711 #if COMPILE_TEMPLATE_MMX
1712     __asm__ volatile(
1713     "mov                  %0, %%"REG_a"         \n\t"
1714     "1:                                         \n\t"
1715     "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1716     "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1717     "psrlw                $8, %%mm0             \n\t"
1718     "psrlw                $8, %%mm1             \n\t"
1719     "packuswb          %%mm1, %%mm0             \n\t"
1720     "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1721     "add                  $8, %%"REG_a"         \n\t"
1722     " js                  1b                    \n\t"
1723     : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1724     : "%"REG_a
1725     );
1726 #else
1727     int i;
1728     for (i=0; i<width; i++)
1729         dst[i]= src[2*i+1];
1730 #endif
1731 }
1732
1733 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1734 {
1735 #if COMPILE_TEMPLATE_MMX
1736     __asm__ volatile(
1737     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1738     "mov                    %0, %%"REG_a"       \n\t"
1739     "1:                                         \n\t"
1740     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1741     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1742     "pand                %%mm4, %%mm0           \n\t"
1743     "pand                %%mm4, %%mm1           \n\t"
1744     "packuswb            %%mm1, %%mm0           \n\t"
1745     "movq                %%mm0, %%mm1           \n\t"
1746     "psrlw                  $8, %%mm0           \n\t"
1747     "pand                %%mm4, %%mm1           \n\t"
1748     "packuswb            %%mm0, %%mm0           \n\t"
1749     "packuswb            %%mm1, %%mm1           \n\t"
1750     "movd                %%mm0, (%3, %%"REG_a") \n\t"
1751     "movd                %%mm1, (%2, %%"REG_a") \n\t"
1752     "add                    $4, %%"REG_a"       \n\t"
1753     " js                    1b                  \n\t"
1754     : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1755     : "%"REG_a
1756     );
1757 #else
1758     int i;
1759     for (i=0; i<width; i++)
1760     {
1761         dstU[i]= src1[4*i + 0];
1762         dstV[i]= src1[4*i + 2];
1763     }
1764 #endif
1765     assert(src1 == src2);
1766 }
1767
1768 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1769 {
1770 #if COMPILE_TEMPLATE_MMX
1771     __asm__ volatile(
1772     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1773     "mov                    %0, %%"REG_a"       \n\t"
1774     "1:                                         \n\t"
1775     "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1776     "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1777     "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1778     "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1779     "pand                %%mm4, %%mm0           \n\t"
1780     "pand                %%mm4, %%mm1           \n\t"
1781     "pand                %%mm4, %%mm2           \n\t"
1782     "pand                %%mm4, %%mm3           \n\t"
1783     "packuswb            %%mm1, %%mm0           \n\t"
1784     "packuswb            %%mm3, %%mm2           \n\t"
1785     "movq                %%mm0, (%3, %%"REG_a") \n\t"
1786     "movq                %%mm2, (%4, %%"REG_a") \n\t"
1787     "add                    $8, %%"REG_a"       \n\t"
1788     " js                    1b                  \n\t"
1789     : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1790     : "%"REG_a
1791     );
1792 #else
1793     int i;
1794     for (i=0; i<width; i++)
1795     {
1796         dstU[i]= src1[2*i];
1797         dstV[i]= src2[2*i];
1798     }
1799 #endif
1800 }
1801
1802 #if COMPILE_TEMPLATE_MMX
1803 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
1804 {
1805
1806     if(srcFormat == PIX_FMT_BGR24){
1807         __asm__ volatile(
1808             "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1809             "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1810             :
1811         );
1812     }else{
1813         __asm__ volatile(
1814             "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1815             "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1816             :
1817         );
1818     }
1819
1820     __asm__ volatile(
1821         "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1822         "mov                        %2, %%"REG_a"   \n\t"
1823         "pxor                    %%mm7, %%mm7       \n\t"
1824         "1:                                         \n\t"
1825         PREFETCH"               64(%0)              \n\t"
1826         "movd                     (%0), %%mm0       \n\t"
1827         "movd                    2(%0), %%mm1       \n\t"
1828         "movd                    6(%0), %%mm2       \n\t"
1829         "movd                    8(%0), %%mm3       \n\t"
1830         "add                       $12, %0          \n\t"
1831         "punpcklbw               %%mm7, %%mm0       \n\t"
1832         "punpcklbw               %%mm7, %%mm1       \n\t"
1833         "punpcklbw               %%mm7, %%mm2       \n\t"
1834         "punpcklbw               %%mm7, %%mm3       \n\t"
1835         "pmaddwd                 %%mm5, %%mm0       \n\t"
1836         "pmaddwd                 %%mm6, %%mm1       \n\t"
1837         "pmaddwd                 %%mm5, %%mm2       \n\t"
1838         "pmaddwd                 %%mm6, %%mm3       \n\t"
1839         "paddd                   %%mm1, %%mm0       \n\t"
1840         "paddd                   %%mm3, %%mm2       \n\t"
1841         "paddd                   %%mm4, %%mm0       \n\t"
1842         "paddd                   %%mm4, %%mm2       \n\t"
1843         "psrad                     $15, %%mm0       \n\t"
1844         "psrad                     $15, %%mm2       \n\t"
1845         "packssdw                %%mm2, %%mm0       \n\t"
1846         "packuswb                %%mm0, %%mm0       \n\t"
1847         "movd                %%mm0, (%1, %%"REG_a") \n\t"
1848         "add                        $4, %%"REG_a"   \n\t"
1849         " js                        1b              \n\t"
1850     : "+r" (src)
1851     : "r" (dst+width), "g" ((x86_reg)-width)
1852     : "%"REG_a
1853     );
1854 }
1855
1856 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
1857 {
1858     __asm__ volatile(
1859         "movq                    24+%4, %%mm6       \n\t"
1860         "mov                        %3, %%"REG_a"   \n\t"
1861         "pxor                    %%mm7, %%mm7       \n\t"
1862         "1:                                         \n\t"
1863         PREFETCH"               64(%0)              \n\t"
1864         "movd                     (%0), %%mm0       \n\t"
1865         "movd                    2(%0), %%mm1       \n\t"
1866         "punpcklbw               %%mm7, %%mm0       \n\t"
1867         "punpcklbw               %%mm7, %%mm1       \n\t"
1868         "movq                    %%mm0, %%mm2       \n\t"
1869         "movq                    %%mm1, %%mm3       \n\t"
1870         "pmaddwd                    %4, %%mm0       \n\t"
1871         "pmaddwd                  8+%4, %%mm1       \n\t"
1872         "pmaddwd                 16+%4, %%mm2       \n\t"
1873         "pmaddwd                 %%mm6, %%mm3       \n\t"
1874         "paddd                   %%mm1, %%mm0       \n\t"
1875         "paddd                   %%mm3, %%mm2       \n\t"
1876
1877         "movd                    6(%0), %%mm1       \n\t"
1878         "movd                    8(%0), %%mm3       \n\t"
1879         "add                       $12, %0          \n\t"
1880         "punpcklbw               %%mm7, %%mm1       \n\t"
1881         "punpcklbw               %%mm7, %%mm3       \n\t"
1882         "movq                    %%mm1, %%mm4       \n\t"
1883         "movq                    %%mm3, %%mm5       \n\t"
1884         "pmaddwd                    %4, %%mm1       \n\t"
1885         "pmaddwd                  8+%4, %%mm3       \n\t"
1886         "pmaddwd                 16+%4, %%mm4       \n\t"
1887         "pmaddwd                 %%mm6, %%mm5       \n\t"
1888         "paddd                   %%mm3, %%mm1       \n\t"
1889         "paddd                   %%mm5, %%mm4       \n\t"
1890
1891         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1892         "paddd                   %%mm3, %%mm0       \n\t"
1893         "paddd                   %%mm3, %%mm2       \n\t"
1894         "paddd                   %%mm3, %%mm1       \n\t"
1895         "paddd                   %%mm3, %%mm4       \n\t"
1896         "psrad                     $15, %%mm0       \n\t"
1897         "psrad                     $15, %%mm2       \n\t"
1898         "psrad                     $15, %%mm1       \n\t"
1899         "psrad                     $15, %%mm4       \n\t"
1900         "packssdw                %%mm1, %%mm0       \n\t"
1901         "packssdw                %%mm4, %%mm2       \n\t"
1902         "packuswb                %%mm0, %%mm0       \n\t"
1903         "packuswb                %%mm2, %%mm2       \n\t"
1904         "movd                %%mm0, (%1, %%"REG_a") \n\t"
1905         "movd                %%mm2, (%2, %%"REG_a") \n\t"
1906         "add                        $4, %%"REG_a"   \n\t"
1907         " js                        1b              \n\t"
1908     : "+r" (src)
1909     : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1910     : "%"REG_a
1911     );
1912 }
1913 #endif
1914
1915 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1916 {
1917 #if COMPILE_TEMPLATE_MMX
1918     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1919 #else
1920     int i;
1921     for (i=0; i<width; i++)
1922     {
1923         int b= src[i*3+0];
1924         int g= src[i*3+1];
1925         int r= src[i*3+2];
1926
1927         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1928     }
1929 #endif /* COMPILE_TEMPLATE_MMX */
1930 }
1931
1932 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1933 {
1934 #if COMPILE_TEMPLATE_MMX
1935     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1936 #else
1937     int i;
1938     for (i=0; i<width; i++)
1939     {
1940         int b= src1[3*i + 0];
1941         int g= src1[3*i + 1];
1942         int r= src1[3*i + 2];
1943
1944         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1945         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1946     }
1947 #endif /* COMPILE_TEMPLATE_MMX */
1948     assert(src1 == src2);
1949 }
1950
1951 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1952 {
1953     int i;
1954     for (i=0; i<width; i++)
1955     {
1956         int b= src1[6*i + 0] + src1[6*i + 3];
1957         int g= src1[6*i + 1] + src1[6*i + 4];
1958         int r= src1[6*i + 2] + src1[6*i + 5];
1959
1960         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1961         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1962     }
1963     assert(src1 == src2);
1964 }
1965
1966 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1967 {
1968 #if COMPILE_TEMPLATE_MMX
1969     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1970 #else
1971     int i;
1972     for (i=0; i<width; i++)
1973     {
1974         int r= src[i*3+0];
1975         int g= src[i*3+1];
1976         int b= src[i*3+2];
1977
1978         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1979     }
1980 #endif
1981 }
1982
1983 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1984 {
1985 #if COMPILE_TEMPLATE_MMX
1986     assert(src1==src2);
1987     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1988 #else
1989     int i;
1990     assert(src1==src2);
1991     for (i=0; i<width; i++)
1992     {
1993         int r= src1[3*i + 0];
1994         int g= src1[3*i + 1];
1995         int b= src1[3*i + 2];
1996
1997         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1998         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1999     }
2000 #endif
2001 }
2002
2003 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2004 {
2005     int i;
2006     assert(src1==src2);
2007     for (i=0; i<width; i++)
2008     {
2009         int r= src1[6*i + 0] + src1[6*i + 3];
2010         int g= src1[6*i + 1] + src1[6*i + 4];
2011         int b= src1[6*i + 2] + src1[6*i + 5];
2012
2013         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2014         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2015     }
2016 }
2017
2018
2019 // bilinear / bicubic scaling
2020 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2021                                   const int16_t *filter, const int16_t *filterPos, long filterSize)
2022 {
2023 #if COMPILE_TEMPLATE_MMX
2024     assert(filterSize % 4 == 0 && filterSize>0);
2025     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2026     {
2027         x86_reg counter= -2*dstW;
2028         filter-= counter*2;
2029         filterPos-= counter/2;
2030         dst-= counter/2;
2031         __asm__ volatile(
2032 #if defined(PIC)
2033         "push            %%"REG_b"              \n\t"
2034 #endif
2035         "pxor                %%mm7, %%mm7       \n\t"
2036         "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2037         "mov             %%"REG_a", %%"REG_BP"  \n\t"
2038         ASMALIGN(4)
2039         "1:                                     \n\t"
2040         "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2041         "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2042         "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2043         "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2044         "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2045         "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2046         "punpcklbw           %%mm7, %%mm0       \n\t"
2047         "punpcklbw           %%mm7, %%mm2       \n\t"
2048         "pmaddwd             %%mm1, %%mm0       \n\t"
2049         "pmaddwd             %%mm2, %%mm3       \n\t"
2050         "movq                %%mm0, %%mm4       \n\t"
2051         "punpckldq           %%mm3, %%mm0       \n\t"
2052         "punpckhdq           %%mm3, %%mm4       \n\t"
2053         "paddd               %%mm4, %%mm0       \n\t"
2054         "psrad                  $7, %%mm0       \n\t"
2055         "packssdw            %%mm0, %%mm0       \n\t"
2056         "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2057         "add                    $4, %%"REG_BP"  \n\t"
2058         " jnc                   1b              \n\t"
2059
2060         "pop            %%"REG_BP"              \n\t"
2061 #if defined(PIC)
2062         "pop             %%"REG_b"              \n\t"
2063 #endif
2064         : "+a" (counter)
2065         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2066 #if !defined(PIC)
2067         : "%"REG_b
2068 #endif
2069         );
2070     }
2071     else if (filterSize==8)
2072     {
2073         x86_reg counter= -2*dstW;
2074         filter-= counter*4;
2075         filterPos-= counter/2;
2076         dst-= counter/2;
2077         __asm__ volatile(
2078 #if defined(PIC)
2079         "push             %%"REG_b"             \n\t"
2080 #endif
2081         "pxor                 %%mm7, %%mm7      \n\t"
2082         "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2083         "mov              %%"REG_a", %%"REG_BP" \n\t"
2084         ASMALIGN(4)
2085         "1:                                     \n\t"
2086         "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2087         "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2088         "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2089         "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2090         "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2091         "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2092         "punpcklbw            %%mm7, %%mm0      \n\t"
2093         "punpcklbw            %%mm7, %%mm2      \n\t"
2094         "pmaddwd              %%mm1, %%mm0      \n\t"
2095         "pmaddwd              %%mm2, %%mm3      \n\t"
2096
2097         "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2098         "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2099         "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2100         "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2101         "punpcklbw            %%mm7, %%mm4      \n\t"
2102         "punpcklbw            %%mm7, %%mm2      \n\t"
2103         "pmaddwd              %%mm1, %%mm4      \n\t"
2104         "pmaddwd              %%mm2, %%mm5      \n\t"
2105         "paddd                %%mm4, %%mm0      \n\t"
2106         "paddd                %%mm5, %%mm3      \n\t"
2107         "movq                 %%mm0, %%mm4      \n\t"
2108         "punpckldq            %%mm3, %%mm0      \n\t"
2109         "punpckhdq            %%mm3, %%mm4      \n\t"
2110         "paddd                %%mm4, %%mm0      \n\t"
2111         "psrad                   $7, %%mm0      \n\t"
2112         "packssdw             %%mm0, %%mm0      \n\t"
2113         "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2114         "add                     $4, %%"REG_BP" \n\t"
2115         " jnc                    1b             \n\t"
2116
2117         "pop             %%"REG_BP"             \n\t"
2118 #if defined(PIC)
2119         "pop              %%"REG_b"             \n\t"
2120 #endif
2121         : "+a" (counter)
2122         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2123 #if !defined(PIC)
2124         : "%"REG_b
2125 #endif
2126         );
2127     }
2128     else
2129     {
2130         uint8_t *offset = src+filterSize;
2131         x86_reg counter= -2*dstW;
2132         //filter-= counter*filterSize/2;
2133         filterPos-= counter/2;
2134         dst-= counter/2;
2135         __asm__ volatile(
2136         "pxor                  %%mm7, %%mm7     \n\t"
2137         ASMALIGN(4)
2138         "1:                                     \n\t"
2139         "mov                      %2, %%"REG_c" \n\t"
2140         "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2141         "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2142         "mov                      %5, %%"REG_c" \n\t"
2143         "pxor                  %%mm4, %%mm4     \n\t"
2144         "pxor                  %%mm5, %%mm5     \n\t"
2145         "2:                                     \n\t"
2146         "movq                   (%1), %%mm1     \n\t"
2147         "movq               (%1, %6), %%mm3     \n\t"
2148         "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2149         "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2150         "punpcklbw             %%mm7, %%mm0     \n\t"
2151         "punpcklbw             %%mm7, %%mm2     \n\t"
2152         "pmaddwd               %%mm1, %%mm0     \n\t"
2153         "pmaddwd               %%mm2, %%mm3     \n\t"
2154         "paddd                 %%mm3, %%mm5     \n\t"
2155         "paddd                 %%mm0, %%mm4     \n\t"
2156         "add                      $8, %1        \n\t"
2157         "add                      $4, %%"REG_c" \n\t"
2158         "cmp                      %4, %%"REG_c" \n\t"
2159         " jb                      2b            \n\t"
2160         "add                      %6, %1        \n\t"
2161         "movq                  %%mm4, %%mm0     \n\t"
2162         "punpckldq             %%mm5, %%mm4     \n\t"
2163         "punpckhdq             %%mm5, %%mm0     \n\t"
2164         "paddd                 %%mm0, %%mm4     \n\t"
2165         "psrad                    $7, %%mm4     \n\t"
2166         "packssdw              %%mm4, %%mm4     \n\t"
2167         "mov                      %3, %%"REG_a" \n\t"
2168         "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2169         "add                      $4, %0        \n\t"
2170         " jnc                     1b            \n\t"
2171
2172         : "+r" (counter), "+r" (filter)
2173         : "m" (filterPos), "m" (dst), "m"(offset),
2174           "m" (src), "r" ((x86_reg)filterSize*2)
2175         : "%"REG_a, "%"REG_c, "%"REG_d
2176         );
2177     }
2178 #else
2179 #if COMPILE_TEMPLATE_ALTIVEC
2180     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2181 #else
2182     int i;
2183     for (i=0; i<dstW; i++)
2184     {
2185         int j;
2186         int srcPos= filterPos[i];
2187         int val=0;
2188         //printf("filterPos: %d\n", filterPos[i]);
2189         for (j=0; j<filterSize; j++)
2190         {
2191             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2192             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2193         }
2194         //filter += hFilterSize;
2195         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2196         //dst[i] = val>>7;
2197     }
2198 #endif /* COMPILE_ALTIVEC */
2199 #endif /* COMPILE_MMX */
2200 }
2201
2202 #define FAST_BILINEAR_X86 \
2203     "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
2204     "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
2205     "shll      $16, %%edi    \n\t"                                              \
2206     "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
2207     "mov        %1, %%"REG_D"\n\t"                                              \