00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "libavcodec/h264dsp.h"
00029 #include "libavcodec/mpegvideo.h"
00030 #include "libavcodec/simple_idct.h"
00031 #include "libavcodec/ac3dec.h"
00032 #include "dsputil_mmx.h"
00033 #include "idct_xvid.h"
00034
00035
00036
00037
00038
/*
 * Constant tables used as memory operands by the SIMD assembly in this file.
 * They are emitted through DECLARE_ALIGNED with 8-byte alignment for the
 * 64-bit values and 16-byte alignment for the 128-bit (xmm_reg / array) ones.
 */

/* 0x01 in every byte and 0x0002 in every 16-bit lane; these are the values
 * the non-PIC MOVQ_BONE / MOVQ_WTWO macros load. */
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

/* 0x80000000 in each of the four 32-bit lanes. */
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};

/* ff_pw_N: the value N replicated in every 16-bit lane. */
DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;

/* ff_pb_XX: the hexadecimal byte value XX replicated in every byte. */
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};

/* Pairs of doubles holding 1.0 and 2.0. */
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
00083
/* Emit an 8-byte (2^3) alignment directive into the instruction stream. */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear the MMX register regd by XOR-ing it with itself. */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * Fill regd with 0xFE in every byte: pcmpeqd of a register with itself sets
 * all bits, and paddb of 0xFF with 0xFF wraps to 0xFE in each byte.
 */
#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
        "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC build: load the constants straight from ff_bone / ff_wtwo. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else

/*
 * PIC build: synthesize the same values in registers without touching memory.
 * pcmpeqd gives all ones, psrlw $15 leaves 0x0001 in every 16-bit lane.
 */

/* 0x01 in every byte: packuswb narrows the 0x0001 words to 0x01 bytes. */
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* 0x0002 in every 16-bit lane: the 0x0001 words are shifted left by one. */
#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd " \n\t"::)

#endif
00111
00112
00113
00114
/*
 * Byte-wise averaging helpers built from plain MMX instructions. Each macro
 * expands to a sequence of asm string fragments meant to be pasted inside an
 * __asm__ statement; the arguments are register (or operand) names.
 *
 * All of them mask (a ^ b) before the 64-bit psrlq so that, when the mask
 * holds 0xFE in every byte (see MOVQ_BFE), no bit is shifted from one byte
 * into its neighbour.
 */

/*
 * regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * With regfe = 0xFE bytes this is the per-byte average rounded down.
 * regb is overwritten.
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/*
 * regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * With regfe = 0xFE bytes this is the per-byte average rounded up.
 * regb is overwritten.
 */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"


/*
 * Paired form of PAVGB_MMX_NO_RND: averages (rega, regb) into regr and
 * (regc, regd) into regp with the two instruction streams interleaved.
 * The mask is taken from %%mm6 rather than from a parameter; regb and regd
 * are overwritten.
 */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/*
 * Paired form of PAVGB_MMX (round-up average), again using %%mm6 as the
 * mask and overwriting regb and regd.
 */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
00159
00160
00161
/*
 * The pixel put/avg routines are generated by including template files
 * several times with different definitions of DEF (name mangling), PAVGB /
 * PAVGBP (averaging primitive), SET_RND and OP_AVG.
 */

/* Pass 1: names of the form <x>_no_rnd_<y>_mmx, using the round-down
 * averaging macros. MOVQ_WONE is not defined in the visible part of this
 * file; presumably it comes from one of the included headers. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

/* OP_AVG is deliberately left defined: the second pass reuses it. */
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB


/* Pass 2: names of the form <x>_<y>_mmx, using the round-up averaging
 * macros and MOVQ_WTWO as the rounding constant loader. */
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG


/* 3DNow! pass: names get the _3dnow suffix and averaging is done with the
 * pavgusb instruction. */
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG


/* MMX2 pass: names get the _mmx2 suffix and averaging is done with the
 * pavgb instruction. */
#define DEF(x) x ## _mmx2


#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
00217
/*
 * Plain copies involve no averaging, so the no_rnd, MMX2 and 3DNow! names
 * for the unshifted put_pixels functions are all aliased to the MMX copy
 * routines defined below.
 */
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
00230
00231
00232
00233
00234 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00235 {
00236 const DCTELEM *p;
00237 uint8_t *pix;
00238
00239
00240 p = block;
00241 pix = pixels;
00242
00243 __asm__ volatile(
00244 "movq %3, %%mm0 \n\t"
00245 "movq 8%3, %%mm1 \n\t"
00246 "movq 16%3, %%mm2 \n\t"
00247 "movq 24%3, %%mm3 \n\t"
00248 "movq 32%3, %%mm4 \n\t"
00249 "movq 40%3, %%mm5 \n\t"
00250 "movq 48%3, %%mm6 \n\t"
00251 "movq 56%3, %%mm7 \n\t"
00252 "packuswb %%mm1, %%mm0 \n\t"
00253 "packuswb %%mm3, %%mm2 \n\t"
00254 "packuswb %%mm5, %%mm4 \n\t"
00255 "packuswb %%mm7, %%mm6 \n\t"
00256 "movq %%mm0, (%0) \n\t"
00257 "movq %%mm2, (%0, %1) \n\t"
00258 "movq %%mm4, (%0, %1, 2) \n\t"
00259 "movq %%mm6, (%0, %2) \n\t"
00260 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
00261 :"memory");
00262 pix += line_size*4;
00263 p += 32;
00264
00265
00266
00267
00268 __asm__ volatile(
00269 "movq (%3), %%mm0 \n\t"
00270 "movq 8(%3), %%mm1 \n\t"
00271 "movq 16(%3), %%mm2 \n\t"
00272 "movq 24(%3), %%mm3 \n\t"
00273 "movq 32(%3), %%mm4 \n\t"
00274 "movq 40(%3), %%mm5 \n\t"
00275 "movq 48(%3), %%mm6 \n\t"
00276 "movq 56(%3), %%mm7 \n\t"
00277 "packuswb %%mm1, %%mm0 \n\t"
00278 "packuswb %%mm3, %%mm2 \n\t"
00279 "packuswb %%mm5, %%mm4 \n\t"
00280 "packuswb %%mm7, %%mm6 \n\t"
00281 "movq %%mm0, (%0) \n\t"
00282 "movq %%mm2, (%0, %1) \n\t"
00283 "movq %%mm4, (%0, %1, 2) \n\t"
00284 "movq %%mm6, (%0, %2) \n\t"
00285 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
00286 :"memory");
00287 }
00288
/*
 * Asm fragment that converts four rows (32 coefficients starting at byte
 * offset 'off' from operand %2) to pixels and stores them.
 *
 * Expected operands of the enclosing asm statement:
 *   %0  destination pointer for the first of the four rows
 *   %1  three times the row stride
 *   %2  pointer to the coefficients
 *   %3  row stride
 * %%mm0 must already hold the bias that is added to every byte.
 * packsswb saturates the 16-bit coefficients to signed bytes, then paddb
 * adds the bias. Clobbers mm1-mm4.
 */
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t"\
    "movq 16+"#off"(%2), %%mm2 \n\t"\
    "movq 32+"#off"(%2), %%mm3 \n\t"\
    "movq 48+"#off"(%2), %%mm4 \n\t"\
    "packsswb 8+"#off"(%2), %%mm1 \n\t"\
    "packsswb 24+"#off"(%2), %%mm2 \n\t"\
    "packsswb 40+"#off"(%2), %%mm3 \n\t"\
    "packsswb 56+"#off"(%2), %%mm4 \n\t"\
    "paddb %%mm0, %%mm1 \n\t"\
    "paddb %%mm0, %%mm2 \n\t"\
    "paddb %%mm0, %%mm3 \n\t"\
    "paddb %%mm0, %%mm4 \n\t"\
    "movq %%mm1, (%0) \n\t"\
    "movq %%mm2, (%0, %3) \n\t"\
    "movq %%mm3, (%0, %3, 2) \n\t"\
    "movq %%mm4, (%0, %1) \n\t"
00306
/**
 * Store an 8x8 block of signed coefficients as pixels: every coefficient is
 * saturated to a signed byte and then offset by 0x80 (ff_pb_80).
 *
 * @param block     64 coefficients, row after row
 * @param pixels    destination of the first row
 * @param line_size distance in bytes between destination rows
 */
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3; /* written by the asm: 3 * line_skip */

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" /* bias added to every byte */
        "lea (%3, %3, 2), %1 \n\t"            /* line_skip3 = 3 * line_skip */
        put_signed_pixels_clamped_mmx_half(0) /* rows 0..3 */
        "lea (%0, %3, 4), %0 \n\t"            /* pixels += 4 * line_skip */
        put_signed_pixels_clamped_mmx_half(64) /* rows 4..7 (byte offset 64) */
        :"+&r" (pixels), "=&r" (line_skip3)
        :"r" (block), "r"(line_skip)
        :"memory");
}
00322
00323 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00324 {
00325 const DCTELEM *p;
00326 uint8_t *pix;
00327 int i;
00328
00329
00330 p = block;
00331 pix = pixels;
00332 MOVQ_ZERO(mm7);
00333 i = 4;
00334 do {
00335 __asm__ volatile(
00336 "movq (%2), %%mm0 \n\t"
00337 "movq 8(%2), %%mm1 \n\t"
00338 "movq 16(%2), %%mm2 \n\t"
00339 "movq 24(%2), %%mm3 \n\t"
00340 "movq %0, %%mm4 \n\t"
00341 "movq %1, %%mm6 \n\t"
00342 "movq %%mm4, %%mm5 \n\t"
00343 "punpcklbw %%mm7, %%mm4 \n\t"
00344 "punpckhbw %%mm7, %%mm5 \n\t"
00345 "paddsw %%mm4, %%mm0 \n\t"
00346 "paddsw %%mm5, %%mm1 \n\t"
00347 "movq %%mm6, %%mm5 \n\t"
00348 "punpcklbw %%mm7, %%mm6 \n\t"
00349 "punpckhbw %%mm7, %%mm5 \n\t"
00350 "paddsw %%mm6, %%mm2 \n\t"
00351 "paddsw %%mm5, %%mm3 \n\t"
00352 "packuswb %%mm1, %%mm0 \n\t"
00353 "packuswb %%mm3, %%mm2 \n\t"
00354 "movq %%mm0, %0 \n\t"
00355 "movq %%mm2, %1 \n\t"
00356 :"+m"(*pix), "+m"(*(pix+line_size))
00357 :"r"(p)
00358 :"memory");
00359 pix += line_size*2;
00360 p += 16;
00361 } while (--i);
00362 }
00363
/**
 * Copy a block that is 4 bytes wide and h rows high from pixels to block.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; the loop copies four rows per iteration
 *                  and runs until h reaches exactly zero, so h has to be a
 *                  positive multiple of 4
 */
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size */
        ".p2align 3 \n\t"
        "1: \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
00389
/**
 * Copy a block that is 8 bytes wide and h rows high from pixels to block.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; the loop copies four rows per iteration
 *                  and runs until h reaches exactly zero, so h has to be a
 *                  positive multiple of 4
 */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size */
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
00415
/**
 * Copy a block that is 16 bytes wide and h rows high from pixels to block,
 * moving each row as two 8-byte halves.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; the loop copies four rows per iteration
 *                  and runs until h reaches exactly zero, so h has to be a
 *                  positive multiple of 4
 */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size */
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
00449
/**
 * Copy a block that is 16 bytes wide and h rows high from pixels to block
 * using SSE2, four rows per iteration.
 *
 * Loads use movdqu, so the source may be unaligned; stores use movdqa, so
 * every destination row has to be 16-byte aligned.
 *
 * @param block     destination (16-byte aligned rows)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; must be a positive multiple of 4 because
 *                  the loop only stops when h reaches exactly zero
 */
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "subl $4, %0 \n\t"             /* sets the flags tested by jnz ... */
        "lea (%1,%3,4), %1 \n\t"       /* ... lea does not modify flags */
        "lea (%2,%3,4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory"
        );
}
00471
/**
 * Average a 16-byte-wide, h-row block from pixels into block using SSE2:
 * each destination byte becomes pavgb(source, destination).
 *
 * Source rows are loaded with movdqu (may be unaligned); the destination is
 * used as a pavgb memory operand and written with movdqa, so every
 * destination row has to be 16-byte aligned.
 *
 * @param block     destination, read and updated in place
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; must be a positive multiple of 4 because
 *                  the loop only stops when h reaches exactly zero
 */
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "pavgb (%2), %%xmm0 \n\t"
        "pavgb (%2,%3), %%xmm1 \n\t"
        "pavgb (%2,%3,2), %%xmm2 \n\t"
        "pavgb (%2,%4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "subl $4, %0 \n\t"             /* sets the flags tested by jnz ... */
        "lea (%1,%3,4), %1 \n\t"       /* ... lea does not modify flags */
        "lea (%2,%3,4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory"
        );
}
00497
00498 #define CLEAR_BLOCKS(name,n) \
00499 static void name(DCTELEM *blocks)\
00500 {\
00501 __asm__ volatile(\
00502 "pxor %%mm7, %%mm7 \n\t"\
00503 "mov %1, %%"REG_a" \n\t"\
00504 "1: \n\t"\
00505 "movq %%mm7, (%0, %%"REG_a") \n\t"\
00506 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
00507 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
00508 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
00509 "add $32, %%"REG_a" \n\t"\
00510 " js 1b \n\t"\
00511 : : "r" (((uint8_t *)blocks)+128*n),\
00512 "i" (-128*n)\
00513 : "%"REG_a\
00514 );\
00515 }
00516 CLEAR_BLOCKS(clear_blocks_mmx, 6)
00517 CLEAR_BLOCKS(clear_block_mmx, 1)
00518
/**
 * Zero one 128-byte block with eight 16-byte SSE stores.
 *
 * movaps requires its memory operand to be 16-byte aligned, so block has to
 * be 16-byte aligned.
 */
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile(
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
        );
}
00535
00536 static void clear_blocks_sse(DCTELEM *blocks)
00537 {\
00538 __asm__ volatile(
00539 "xorps %%xmm0, %%xmm0 \n"
00540 "mov %1, %%"REG_a" \n"
00541 "1: \n"
00542 "movaps %%xmm0, (%0, %%"REG_a") \n"
00543 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
00544 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
00545 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
00546 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
00547 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
00548 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
00549 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
00550 "add $128, %%"REG_a" \n"
00551 " js 1b \n"
00552 : : "r" (((uint8_t *)blocks)+128*6),
00553 "i" (-128*6)
00554 : "%"REG_a
00555 );
00556 }
00557
00558 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
00559 x86_reg i=0;
00560 __asm__ volatile(
00561 "jmp 2f \n\t"
00562 "1: \n\t"
00563 "movq (%1, %0), %%mm0 \n\t"
00564 "movq (%2, %0), %%mm1 \n\t"
00565 "paddb %%mm0, %%mm1 \n\t"
00566 "movq %%mm1, (%2, %0) \n\t"
00567 "movq 8(%1, %0), %%mm0 \n\t"
00568 "movq 8(%2, %0), %%mm1 \n\t"
00569 "paddb %%mm0, %%mm1 \n\t"
00570 "movq %%mm1, 8(%2, %0) \n\t"
00571 "add $16, %0 \n\t"
00572 "2: \n\t"
00573 "cmp %3, %0 \n\t"
00574 " js 1b \n\t"
00575 : "+r" (i)
00576 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
00577 );
00578 for(; i<w; i++)
00579 dst[i+0] += src[i+0];
00580 }
00581
00582 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
00583 x86_reg i=0;
00584 __asm__ volatile(
00585 "jmp 2f \n\t"
00586 "1: \n\t"
00587 "movq (%2, %0), %%mm0 \n\t"
00588 "movq 8(%2, %0), %%mm1 \n\t"
00589 "paddb (%3, %0), %%mm0 \n\t"
00590 "paddb 8(%3, %0), %%mm1 \n\t"
00591 "movq %%mm0, (%1, %0) \n\t"
00592 "movq %%mm1, 8(%1, %0) \n\t"
00593 "add $16, %0 \n\t"
00594 "2: \n\t"
00595 "cmp %4, %0 \n\t"
00596 " js 1b \n\t"
00597 : "+r" (i)
00598 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
00599 );
00600 for(; i<w; i++)
00601 dst[i] = src1[i] + src2[i];
00602 }
00603
#if HAVE_7REGS && HAVE_TEN_OPERANDS
/**
 * Median prediction, one byte per iteration, using cmov.
 *
 * For each x in [0, w) the asm forms three candidates: the running left
 * value l, the byte top[x], and (l + top[x] - tl) truncated to 8 bits. It
 * selects their median with cmp/cmov, adds diff[x] (8-bit wrap-around),
 * stores the result to dst[x] and carries it on as the new l; tl becomes
 * the top[x] just read.
 *
 * @param dst      output bytes
 * @param top      bytes used as the "top" predictor
 * @param diff     residual added to the prediction
 * @param w        number of bytes to process
 * @param left     in: initial left value (low 8 bits are used);
 *                 out: last value written to dst
 * @param left_top in: initial top-left value (low 8 bits are used);
 *                 out: last top byte read
 *
 * The buffers are addressed from their ends with a negative index (w2 runs
 * from -w up to 0). Operand %3 (x) is reused as scratch and reloaded with
 * the top pointer at the start of every iteration.
 */
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
    x86_reg w2 = -w;
    x86_reg x;
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile(
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3,%4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6,%4), %b0 \n"
        "mov %b0, (%5,%4) \n"
        "inc %4 \n"
        "jl 1b \n"
        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
        /* dst is stored to and top/diff are read through register pointers,
         * so the compiler must be told that memory is accessed */
        :"memory"
        );
    *left = l;
    *left_top = tl;
}
#endif
00638
/*
 * Core of the H.263 loop filter as an asm fragment, shared by the vertical
 * and horizontal filter functions below. It processes eight pixel positions
 * at once.
 *
 * Expected operands of the enclosing asm statement:
 *   %0..%3  four 8-byte rows (memory operands) across the filtered edge
 *   %4      32-bit value whose low byte is broadcast to all eight bytes
 *           (the callers pass 2*strength)
 *   %5      8-byte mask ANDed into the correction for the outer rows
 *           (the callers pass ff_pb_FC)
 *
 * The fragment first forms, in 16-bit lanes, (%0 - %3) + 4*(%2 - %1), takes
 * its absolute value shifted right by 3 and keeps the sign as a byte mask;
 * the remainder derives the corrections applied to the rows.
 *
 * Nothing is stored by the fragment itself. On exit the filtered rows are
 * left in registers: mm5 = new %0, mm3 = new %1, mm4 = new %2, mm6 = new %3.
 * All of mm0-mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
00709
00710 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00711 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00712 const int strength= ff_h263_loop_filter_strength[qscale];
00713
00714 __asm__ volatile(
00715
00716 H263_LOOP_FILTER
00717
00718 "movq %%mm3, %1 \n\t"
00719 "movq %%mm4, %2 \n\t"
00720 "movq %%mm5, %0 \n\t"
00721 "movq %%mm6, %3 \n\t"
00722 : "+m" (*(uint64_t*)(src - 2*stride)),
00723 "+m" (*(uint64_t*)(src - 1*stride)),
00724 "+m" (*(uint64_t*)(src + 0*stride)),
00725 "+m" (*(uint64_t*)(src + 1*stride))
00726 : "g" (2*strength), "m"(ff_pb_FC)
00727 );
00728 }
00729 }
00730
00731 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00732 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00733 const int strength= ff_h263_loop_filter_strength[qscale];
00734 DECLARE_ALIGNED(8, uint64_t, temp)[4];
00735 uint8_t *btemp= (uint8_t*)temp;
00736
00737 src -= 2;
00738
00739 transpose4x4(btemp , src , 8, stride);
00740 transpose4x4(btemp+4, src + 4*stride, 8, stride);
00741 __asm__ volatile(
00742 H263_LOOP_FILTER
00743
00744 : "+m" (temp[0]),
00745 "+m" (temp[1]),
00746 "+m" (temp[2]),
00747 "+m" (temp[3])
00748 : "g" (2*strength), "m"(ff_pb_FC)
00749 );
00750
00751 __asm__ volatile(
00752 "movq %%mm5, %%mm1 \n\t"
00753 "movq %%mm4, %%mm0 \n\t"
00754 "punpcklbw %%mm3, %%mm5 \n\t"
00755 "punpcklbw %%mm6, %%mm4 \n\t"
00756 "punpckhbw %%mm3, %%mm1 \n\t"
00757 "punpckhbw %%mm6, %%mm0 \n\t"
00758 "movq %%mm5, %%mm3 \n\t"
00759 "movq %%mm1, %%mm6 \n\t"
00760 "punpcklwd %%mm4, %%mm5 \n\t"
00761 "punpcklwd %%mm0, %%mm1 \n\t"
00762 "punpckhwd %%mm4, %%mm3 \n\t"
00763 "punpckhwd %%mm0, %%mm6 \n\t"
00764 "movd %%mm5, (%0) \n\t"
00765 "punpckhdq %%mm5, %%mm5 \n\t"
00766 "movd %%mm5, (%0,%2) \n\t"
00767 "movd %%mm3, (%0,%2,2) \n\t"
00768 "punpckhdq %%mm3, %%mm3 \n\t"
00769 "movd %%mm3, (%0,%3) \n\t"
00770 "movd %%mm1, (%1) \n\t"
00771 "punpckhdq %%mm1, %%mm1 \n\t"
00772 "movd %%mm1, (%1,%2) \n\t"
00773 "movd %%mm6, (%1,%2,2) \n\t"
00774 "punpckhdq %%mm6, %%mm6 \n\t"
00775 "movd %%mm6, (%1,%3) \n\t"
00776 :: "r" (src),
00777 "r" (src + 4*stride),
00778 "r" ((x86_reg) stride ),
00779 "r" ((x86_reg)(3*stride))
00780 );
00781 }
00782 }
00783
00784
00785
00786 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
00787 {
00788 uint8_t *ptr, *last_line;
00789 int i;
00790
00791 last_line = buf + (height - 1) * wrap;
00792
00793 ptr = buf;
00794 if(w==8)
00795 {
00796 __asm__ volatile(
00797 "1: \n\t"
00798 "movd (%0), %%mm0 \n\t"
00799 "punpcklbw %%mm0, %%mm0 \n\t"
00800 "punpcklwd %%mm0, %%mm0 \n\t"
00801 "punpckldq %%mm0, %%mm0 \n\t"
00802 "movq %%mm0, -8(%0) \n\t"
00803 "movq -8(%0, %2), %%mm1 \n\t"
00804 "punpckhbw %%mm1, %%mm1 \n\t"
00805 "punpckhwd %%mm1, %%mm1 \n\t"
00806 "punpckhdq %%mm1, %%mm1 \n\t"
00807 "movq %%mm1, (%0, %2) \n\t"
00808 "add %1, %0 \n\t"
00809 "cmp %3, %0 \n\t"
00810 " jb 1b \n\t"
00811 : "+r" (ptr)
00812 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
00813 );
00814 }
00815 else
00816 {
00817 __asm__ volatile(
00818 "1: \n\t"
00819 "movd (%0), %%mm0 \n\t"
00820 "punpcklbw %%mm0, %%mm0 \n\t"
00821 "punpcklwd %%mm0, %%mm0 \n\t"
00822 "punpckldq %%mm0, %%mm0 \n\t"
00823 "movq %%mm0, -8(%0) \n\t"
00824 "movq %%mm0, -16(%0) \n\t"
00825 "movq -8(%0, %2), %%mm1 \n\t"
00826 "punpckhbw %%mm1, %%mm1 \n\t"
00827 "punpckhwd %%mm1, %%mm1 \n\t"
00828 "punpckhdq %%mm1, %%mm1 \n\t"
00829 "movq %%mm1, (%0, %2) \n\t"
00830 "movq %%mm1, 8(%0, %2) \n\t"
00831 "add %1, %0 \n\t"
00832 "cmp %3, %0 \n\t"
00833 " jb 1b \n\t"
00834 : "+r" (ptr)
00835 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
00836 );
00837 }
00838
00839 for(i=0;i<w;i+=4) {
00840
00841 if (sides&EDGE_TOP) {
00842 ptr= buf - (i + 1) * wrap - w;
00843 __asm__ volatile(
00844 "1: \n\t"
00845 "movq (%1, %0), %%mm0 \n\t"
00846 "movq %%mm0, (%0) \n\t"
00847 "movq %%mm0, (%0, %2) \n\t"
00848 "movq %%mm0, (%0, %2, 2) \n\t"
00849 "movq %%mm0, (%0, %3) \n\t"
00850 "add $8, %0 \n\t"
00851 "cmp %4, %0 \n\t"
00852 " jb 1b \n\t"
00853 : "+r" (ptr)
00854 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
00855 );
00856 }
00857
00858 if (sides&EDGE_BOTTOM) {
00859 ptr= last_line + (i + 1) * wrap - w;
00860 __asm__ volatile(
00861 "1: \n\t"
00862 "movq (%1, %0), %%mm0 \n\t"
00863 "movq %%mm0, (%0) \n\t"
00864 "movq %%mm0, (%0, %2) \n\t"
00865 "movq %%mm0, (%0, %2, 2) \n\t"
00866 "movq %%mm0, (%0, %3) \n\t"
00867 "add $8, %0 \n\t"
00868 "cmp %4, %0 \n\t"
00869 " jb 1b \n\t"
00870 : "+r" (ptr)
00871 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
00872 );
00873 }
00874 }
00875 }
00876
/*
 * Generate add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp).
 *
 * Asm operands: %0 = byte index i, %1 = dst, %2 = top, %3 = src, %4 = bpp,
 * %5 = end (w-3), %6 = ff_pw_255.
 *
 * The index starts at -bpp, so the first loads read the bytes located bpp
 * before dst and top; these seed the "left" (mm0) and "top-left" (mm1, then
 * mm2) values. Each iteration loads 4 bytes from top and src at index i,
 * works in 16-bit lanes, selects a predictor among left, top and top-left
 * from the three absolute differences produced by 'abs3', adds the src
 * bytes, masks the sum to 8 bits with ff_pw_255 and stores 4 bytes to dst.
 * The unmasked-to-8-bit result stays in mm0 as the next iteration's "left".
 * i advances by bpp and the loop runs while i <= w-3.
 *
 * abs3 is an asm fragment that must replace mm3, mm4 and mm5 by their
 * absolute values and leave mm7 equal to zero (see ABS3_MMX2 / ABS3_SSSE3).
 *
 * NOTE(review): every iteration reads and writes 4 bytes regardless of bpp,
 * so the caller presumably guarantees that touching up to 3 bytes past the
 * processed range is harmless -- confirm against the callers.
 */
#define PAETH(cpu, abs3)\
static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
{\
    x86_reg i = -bpp;\
    x86_reg end = w-3;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n"\
        "movd (%1,%0), %%mm0 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "add %4, %0 \n"\
        "1: \n"\
        "movq %%mm1, %%mm2 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "movq %%mm2, %%mm3 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq %%mm2, %%mm4 \n"\
        "psubw %%mm1, %%mm3 \n"\
        "psubw %%mm0, %%mm4 \n"\
        "movq %%mm3, %%mm5 \n"\
        "paddw %%mm4, %%mm5 \n"\
        abs3\
        "movq %%mm4, %%mm6 \n"\
        "pminsw %%mm5, %%mm6 \n"\
        "pcmpgtw %%mm6, %%mm3 \n"\
        "pcmpgtw %%mm5, %%mm4 \n"\
        "movq %%mm4, %%mm6 \n"\
        "pand %%mm3, %%mm4 \n"\
        "pandn %%mm3, %%mm6 \n"\
        "pandn %%mm0, %%mm3 \n"\
        "movd (%3,%0), %%mm0 \n"\
        "pand %%mm1, %%mm6 \n"\
        "pand %%mm4, %%mm2 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "movq %6, %%mm5 \n"\
        "paddw %%mm6, %%mm0 \n"\
        "paddw %%mm2, %%mm3 \n"\
        "paddw %%mm3, %%mm0 \n"\
        "pand %%mm5, %%mm0 \n"\
        "movq %%mm0, %%mm3 \n"\
        "packuswb %%mm3, %%mm3 \n"\
        "movd %%mm3, (%1,%0) \n"\
        "add %4, %0 \n"\
        "cmp %5, %0 \n"\
        "jle 1b \n"\
        :"+r"(i)\
        :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
         "m"(ff_pw_255)\
        :"memory"\
    );\
}
00929
/*
 * abs3 fragment for PAETH without pabsw: |x| is computed as max(x, 0 - x)
 * for mm5, mm3 and mm4. It relies on mm7 being zero on entry (used for the
 * first negation), uses mm6 and mm7 as scratch and clears mm7 again at the
 * end.
 */
#define ABS3_MMX2\
    "psubw %%mm5, %%mm7 \n"\
    "pmaxsw %%mm7, %%mm5 \n"\
    "pxor %%mm6, %%mm6 \n"\
    "pxor %%mm7, %%mm7 \n"\
    "psubw %%mm3, %%mm6 \n"\
    "psubw %%mm4, %%mm7 \n"\
    "pmaxsw %%mm6, %%mm3 \n"\
    "pmaxsw %%mm7, %%mm4 \n"\
    "pxor %%mm7, %%mm7 \n"

/* abs3 fragment for PAETH using the SSSE3 pabsw instruction; mm7 is left
 * untouched. */
#define ABS3_SSSE3\
    "pabsw %%mm3, %%mm3 \n"\
    "pabsw %%mm4, %%mm4 \n"\
    "pabsw %%mm5, %%mm5 \n"

/* Instantiate add_png_paeth_prediction_mmx2 and, when SSSE3 is available
 * at build time, add_png_paeth_prediction_ssse3. */
PAETH(mmx2, ABS3_MMX2)
#if HAVE_SSSE3
PAETH(ssse3, ABS3_SSSE3)
#endif
00950
/*
 * Asm fragment producing one output of a vertical low-pass filter step.
 * In 16-bit lanes it evaluates
 *     ((m3 + m4) * 20 - (in0 + in7) + 3 * ((in1 + m6) - 2 * (in2 + m5)) + rnd) >> 5
 * packs the result to unsigned bytes with saturation and hands it to
 * OP(%%mm5, out, %%mm7, d) for the final store or average.
 *
 * m3..m6  registers (or operands) holding already loaded inputs; m3 is
 *         overwritten (first with the sum m3 + m4, then with in7)
 * in0, in1, in2, in7  memory operands of further inputs
 * rnd     rounding term added before the shift
 * out     destination passed to OP
 *
 * The pw_20 and pw_3 parameters are not used: the constants are referenced
 * directly through MANGLE(ff_pw_20) and MANGLE(ff_pw_3).
 * %%mm4, %%mm5 and %%mm6 are hard-coded as temporaries and %%mm7 is passed
 * to OP as its scratch register.
 */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" \
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
    "pmullw " #m3 ", %%mm4 \n\t" \
    "movq "#in7", " #m3 " \n\t" \
    "movq "#in0", %%mm5 \n\t" \
    "paddw " #m3 ", %%mm5 \n\t" \
    "psubw %%mm5, %%mm4 \n\t" \
    "movq "#in1", %%mm5 \n\t" \
    "movq "#in2", %%mm6 \n\t" \
    "paddw " #m6 ", %%mm5 \n\t" \
    "paddw " #m5 ", %%mm6 \n\t" \
    "paddw %%mm6, %%mm6 \n\t" \
    "psubw %%mm6, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
    "paddw " #rnd ", %%mm4 \n\t" \
    "paddw %%mm4, %%mm5 \n\t" \
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)
00971
/*
 * QPEL_BASE - generate the horizontal MPEG-4 qpel lowpass filters.
 *
 * Expands to four static functions, each taking
 * (dst, src, dstStride, srcStride, h) and filtering h rows:
 *   OPNAME mpeg4_qpel16_h_lowpass_mmx2   16 output pixels per row, asm
 *   OPNAME mpeg4_qpel16_h_lowpass_3dnow  16 output pixels per row, C + asm
 *   OPNAME mpeg4_qpel8_h_lowpass_mmx2     8 output pixels per row, asm
 *   OPNAME mpeg4_qpel8_h_lowpass_3dnow    8 output pixels per row, C + asm
 *
 * Every output sample is 20*a - 6*b + 3*c - d, plus ROUNDER, shifted right
 * arithmetically by 5 (psraw) and packed to bytes with unsigned saturation
 * (packuswb). a..d are sums of two source pixels; the exact pairs, including
 * the way the taps fold back at both ends of the row, are spelled out in
 * the C code of the _3dnow variants.
 *
 * OPNAME   - prefix pasted onto the generated function names
 * ROUNDER  - packed 16-bit constant added before the shift (memory operand)
 * RND      - not referenced anywhere in this macro
 * OP_MMX2  - store macro used by the _mmx2 variants
 * OP_3DNOW - store macro used by the _3dnow variants
 */
00972 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
/* 16-wide variant written entirely in asm. The loop tests h after the */\
/* body, so at least one row is always processed. */\
00973 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
/* temp: 8-byte slot holding output words 0..3 until words 4..7 are ready */\
00974 uint64_t temp;\
00975 \
00976 __asm__ volatile(\
00977 "pxor %%mm7, %%mm7 \n\t"\
00978 "1: \n\t"\
/* output pixels 0..3: result is spilled to temp (%5) */\
00979 "movq (%0), %%mm0 \n\t" \
00980 "movq %%mm0, %%mm1 \n\t" \
00981 "movq %%mm0, %%mm2 \n\t" \
00982 "punpcklbw %%mm7, %%mm0 \n\t" \
00983 "punpckhbw %%mm7, %%mm1 \n\t" \
00984 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
00985 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
00986 "movq %%mm2, %%mm3 \n\t" \
00987 "movq %%mm2, %%mm4 \n\t" \
00988 "psllq $8, %%mm2 \n\t" \
00989 "psllq $16, %%mm3 \n\t" \
00990 "psllq $24, %%mm4 \n\t" \
00991 "punpckhbw %%mm7, %%mm2 \n\t" \
00992 "punpckhbw %%mm7, %%mm3 \n\t" \
00993 "punpckhbw %%mm7, %%mm4 \n\t" \
00994 "paddw %%mm3, %%mm5 \n\t" \
00995 "paddw %%mm2, %%mm6 \n\t" \
00996 "paddw %%mm5, %%mm5 \n\t" \
00997 "psubw %%mm5, %%mm6 \n\t" \
00998 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
00999 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01000 "paddw %%mm4, %%mm0 \n\t" \
01001 "paddw %%mm1, %%mm5 \n\t" \
01002 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01003 "psubw %%mm5, %%mm0 \n\t" \
01004 "paddw %6, %%mm6 \n\t"\
01005 "paddw %%mm6, %%mm0 \n\t" \
01006 "psraw $5, %%mm0 \n\t"\
01007 "movq %%mm0, %5 \n\t"\
01008 \
01009 \
/* output pixels 4..7: packed behind the words reloaded from temp and */\
/* handed to OP_MMX2 for the first 8 destination bytes */\
01010 "movq 5(%0), %%mm0 \n\t" \
01011 "movq %%mm0, %%mm5 \n\t" \
01012 "movq %%mm0, %%mm6 \n\t" \
01013 "psrlq $8, %%mm0 \n\t" \
01014 "psrlq $16, %%mm5 \n\t" \
01015 "punpcklbw %%mm7, %%mm0 \n\t" \
01016 "punpcklbw %%mm7, %%mm5 \n\t" \
01017 "paddw %%mm0, %%mm2 \n\t" \
01018 "paddw %%mm5, %%mm3 \n\t" \
01019 "paddw %%mm2, %%mm2 \n\t" \
01020 "psubw %%mm2, %%mm3 \n\t" \
01021 "movq %%mm6, %%mm2 \n\t" \
01022 "psrlq $24, %%mm6 \n\t" \
01023 "punpcklbw %%mm7, %%mm2 \n\t" \
01024 "punpcklbw %%mm7, %%mm6 \n\t" \
01025 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01026 "paddw %%mm2, %%mm1 \n\t" \
01027 "paddw %%mm6, %%mm4 \n\t" \
01028 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01029 "psubw %%mm4, %%mm3 \n\t" \
01030 "paddw %6, %%mm1 \n\t"\
01031 "paddw %%mm1, %%mm3 \n\t" \
01032 "psraw $5, %%mm3 \n\t"\
01033 "movq %5, %%mm1 \n\t"\
01034 "packuswb %%mm3, %%mm1 \n\t"\
01035 OP_MMX2(%%mm1, (%1),%%mm4, q)\
01036 \
01037 \
/* output pixels 8..11: result stays in mm0 */\
01038 "movq 9(%0), %%mm1 \n\t" \
01039 "movq %%mm1, %%mm4 \n\t" \
01040 "movq %%mm1, %%mm3 \n\t" \
01041 "psrlq $8, %%mm1 \n\t" \
01042 "psrlq $16, %%mm4 \n\t" \
01043 "punpcklbw %%mm7, %%mm1 \n\t" \
01044 "punpcklbw %%mm7, %%mm4 \n\t" \
01045 "paddw %%mm1, %%mm5 \n\t" \
01046 "paddw %%mm4, %%mm0 \n\t" \
01047 "paddw %%mm5, %%mm5 \n\t" \
01048 "psubw %%mm5, %%mm0 \n\t" \
01049 "movq %%mm3, %%mm5 \n\t" \
01050 "psrlq $24, %%mm3 \n\t" \
01051 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
01052 "punpcklbw %%mm7, %%mm3 \n\t" \
01053 "paddw %%mm3, %%mm2 \n\t" \
01054 "psubw %%mm2, %%mm0 \n\t" \
01055 "movq %%mm5, %%mm2 \n\t" \
01056 "punpcklbw %%mm7, %%mm2 \n\t" \
01057 "punpckhbw %%mm7, %%mm5 \n\t" \
01058 "paddw %%mm2, %%mm6 \n\t" \
01059 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
01060 "paddw %6, %%mm0 \n\t"\
01061 "paddw %%mm6, %%mm0 \n\t" \
01062 "psraw $5, %%mm0 \n\t"\
01063 \
01064 \
/* output pixels 12..15: packed behind mm0 and handed to OP_MMX2 for */\
/* destination bytes 8..15 */\
01065 "paddw %%mm5, %%mm3 \n\t" \
01066 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01067 "paddw %%mm4, %%mm6 \n\t" \
01068 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
01069 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01070 "paddw %%mm1, %%mm4 \n\t" \
01071 "paddw %%mm2, %%mm5 \n\t" \
01072 "paddw %%mm6, %%mm6 \n\t" \
01073 "psubw %%mm6, %%mm4 \n\t" \
01074 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
01075 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
01076 "psubw %%mm5, %%mm3 \n\t" \
01077 "paddw %6, %%mm4 \n\t"\
01078 "paddw %%mm3, %%mm4 \n\t" \
01079 "psraw $5, %%mm4 \n\t"\
01080 "packuswb %%mm4, %%mm0 \n\t"\
01081 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
01082 \
/* next row: src += srcStride, dst += dstStride, loop while --h != 0 */\
01083 "add %3, %0 \n\t"\
01084 "add %4, %1 \n\t"\
01085 "decl %2 \n\t"\
01086 " jnz 1b \n\t"\
/* operands: %0=src %1=dst %2=h %3=srcStride %4=dstStride %5=temp %6=ROUNDER */\
/* NOTE(review): temp is stored to by the asm but is listed as an input */\
/* operand; only the memory clobber tells the compiler about that write. */\
01087 : "+a"(src), "+c"(dst), "+D"(h)\
01088 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(temp), "m"(ROUNDER)\
01089 : "memory"\
01090 );\
01091 }\
01092 \
/* 16-wide variant with the filter sums evaluated in C. Each row reads */\
/* src[0]..src[16]; the asm part only adds ROUNDER, shifts right by 5, */\
/* packs to bytes and hands two 8-byte groups to OP_3DNOW. */\
01093 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01094 int i;\
01095 int16_t temp[16];\
01096 \
01097 for(i=0; i<h; i++)\
01098 {\
/* unrounded sums 20*a - 6*b + 3*c - d; at the left end the outer taps */\
/* reuse src[0], src[1], ... and at the right end src[16], src[15], ... */\
/* so nothing outside src[0]..src[16] is read */\
01099 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01100 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01101 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01102 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01103 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01104 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
01105 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
01106 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
01107 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
01108 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
01109 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
01110 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
01111 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
01112 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
01113 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
01114 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
/* operands: %0=temp %1=dst %2=ROUNDER; words 0..7 go to (%1), 8..15 to 8(%1) */\
01115 __asm__ volatile(\
01116 "movq (%0), %%mm0 \n\t"\
01117 "movq 8(%0), %%mm1 \n\t"\
01118 "paddw %2, %%mm0 \n\t"\
01119 "paddw %2, %%mm1 \n\t"\
01120 "psraw $5, %%mm0 \n\t"\
01121 "psraw $5, %%mm1 \n\t"\
01122 "packuswb %%mm1, %%mm0 \n\t"\
01123 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01124 "movq 16(%0), %%mm0 \n\t"\
01125 "movq 24(%0), %%mm1 \n\t"\
01126 "paddw %2, %%mm0 \n\t"\
01127 "paddw %2, %%mm1 \n\t"\
01128 "psraw $5, %%mm0 \n\t"\
01129 "psraw $5, %%mm1 \n\t"\
01130 "packuswb %%mm1, %%mm0 \n\t"\
01131 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
01132 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01133 : "memory"\
01134 );\
01135 dst+=dstStride;\
01136 src+=srcStride;\
01137 }\
01138 }\
01139 \
/* 8-wide variant written entirely in asm; like the 16-wide one the loop */\
/* tests h after the body. */\
01140 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01141 __asm__ volatile(\
01142 "pxor %%mm7, %%mm7 \n\t"\
01143 "1: \n\t"\
/* output pixels 0..3: result stays in mm0 */\
01144 "movq (%0), %%mm0 \n\t" \
01145 "movq %%mm0, %%mm1 \n\t" \
01146 "movq %%mm0, %%mm2 \n\t" \
01147 "punpcklbw %%mm7, %%mm0 \n\t" \
01148 "punpckhbw %%mm7, %%mm1 \n\t" \
01149 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
01150 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
01151 "movq %%mm2, %%mm3 \n\t" \
01152 "movq %%mm2, %%mm4 \n\t" \
01153 "psllq $8, %%mm2 \n\t" \
01154 "psllq $16, %%mm3 \n\t" \
01155 "psllq $24, %%mm4 \n\t" \
01156 "punpckhbw %%mm7, %%mm2 \n\t" \
01157 "punpckhbw %%mm7, %%mm3 \n\t" \
01158 "punpckhbw %%mm7, %%mm4 \n\t" \
01159 "paddw %%mm3, %%mm5 \n\t" \
01160 "paddw %%mm2, %%mm6 \n\t" \
01161 "paddw %%mm5, %%mm5 \n\t" \
01162 "psubw %%mm5, %%mm6 \n\t" \
01163 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01164 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01165 "paddw %%mm4, %%mm0 \n\t" \
01166 "paddw %%mm1, %%mm5 \n\t" \
01167 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01168 "psubw %%mm5, %%mm0 \n\t" \
01169 "paddw %5, %%mm6 \n\t"\
01170 "paddw %%mm6, %%mm0 \n\t" \
01171 "psraw $5, %%mm0 \n\t"\
01172 \
01173 \
/* output pixels 4..7: packed behind mm0 and handed to OP_MMX2 */\
01174 "movd 5(%0), %%mm5 \n\t" \
01175 "punpcklbw %%mm7, %%mm5 \n\t" \
01176 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01177 "paddw %%mm5, %%mm1 \n\t" \
01178 "paddw %%mm6, %%mm2 \n\t" \
01179 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
01180 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01181 "paddw %%mm6, %%mm3 \n\t" \
01182 "paddw %%mm5, %%mm4 \n\t" \
01183 "paddw %%mm2, %%mm2 \n\t" \
01184 "psubw %%mm2, %%mm3 \n\t" \
01185 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01186 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01187 "psubw %%mm4, %%mm3 \n\t" \
01188 "paddw %5, %%mm1 \n\t"\
01189 "paddw %%mm1, %%mm3 \n\t" \
01190 "psraw $5, %%mm3 \n\t"\
01191 "packuswb %%mm3, %%mm0 \n\t"\
01192 OP_MMX2(%%mm0, (%1), %%mm4, q)\
01193 \
/* next row: src += srcStride, dst += dstStride, loop while --h != 0 */\
01194 "add %3, %0 \n\t"\
01195 "add %4, %1 \n\t"\
01196 "decl %2 \n\t"\
01197 " jnz 1b \n\t"\
/* operands: %0=src %1=dst %2=h %3=srcStride %4=dstStride %5=ROUNDER */\
01198 : "+a"(src), "+c"(dst), "+d"(h)\
01199 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ROUNDER)\
01200 : "memory"\
01201 );\
01202 }\
01203 \
/* 8-wide variant with the filter sums evaluated in C. Each row reads */\
/* src[0]..src[8]; the asm part adds ROUNDER, shifts right by 5, packs */\
/* the eight words to bytes and hands them to OP_3DNOW. */\
01204 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01205 int i;\
01206 int16_t temp[8];\
01207 \
01208 for(i=0; i<h; i++)\
01209 {\
/* same tap pattern as the 16-wide version, folded back at src[8] */\
01210 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01211 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01212 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01213 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01214 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01215 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
01216 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
01217 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
/* operands: %0=temp %1=dst %2=ROUNDER */\
01218 __asm__ volatile(\
01219 "movq (%0), %%mm0 \n\t"\
01220 "movq 8(%0), %%mm1 \n\t"\
01221 "paddw %2, %%mm0 \n\t"\
01222 "paddw %2, %%mm1 \n\t"\
01223 "psraw $5, %%mm0 \n\t"\
01224 "psraw $5, %%mm1 \n\t"\
01225 "packuswb %%mm1, %%mm0 \n\t"\
01226 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01227 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01228 :"memory"\
01229 );\
01230 dst+=dstStride;\
01231 src+=srcStride;\
01232 }\
01233 }
01234
/*
 * QPEL_OP - generate the vertical MPEG-4 qpel lowpass filters and the
 * sixteen mcXY entry points for block sizes 8 and 16.
 *
 * OPNAME  - prefix pasted onto the generated function names
 * ROUNDER - rounding constant passed to QPEL_V_LOW as a memory operand
 * RND     - infix selecting the put helper family used for intermediate
 *           passes (put ## RND ## ...)
 * OP      - store macro passed to QPEL_V_LOW
 * MMX     - suffix pasted onto the generated names and onto every helper
 *           that is called
 *
 * The mcXY functions take (dst, src, stride). As the bodies show, those
 * with Y == 0 use only the horizontal filter, those with X == 0 only the
 * vertical one, and the rest run the horizontal filter over one extra row
 * (9 or 17) and then the vertical filter over that intermediate.
 * Presumably X and Y are the horizontal and vertical quarter-sample
 * phases; confirm against the table that registers these functions.
 */
01235 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
01236 \
/* Vertical filter, 16 pixels wide, 16 output rows, 17 input rows. */\
01237 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01238 uint64_t temp[17*4];\
01239 uint64_t *temp_ptr= temp;\
01240 int count= 17;\
01241 \
01242 \
/* Pass 1: widen 17 rows of 16 source bytes to 16-bit words. The four */\
/* quadwords of row r land at temp[r], temp[17+r], temp[34+r] and */\
/* temp[51+r], so each group of 4 pixels forms a contiguous 17-entry column. */\
/* operands: %0=src %1=temp_ptr %2=count %3=srcStride */\
01243 __asm__ volatile(\
01244 "pxor %%mm7, %%mm7 \n\t"\
01245 "1: \n\t"\
01246 "movq (%0), %%mm0 \n\t"\
01247 "movq (%0), %%mm1 \n\t"\
01248 "movq 8(%0), %%mm2 \n\t"\
01249 "movq 8(%0), %%mm3 \n\t"\
01250 "punpcklbw %%mm7, %%mm0 \n\t"\
01251 "punpckhbw %%mm7, %%mm1 \n\t"\
01252 "punpcklbw %%mm7, %%mm2 \n\t"\
01253 "punpckhbw %%mm7, %%mm3 \n\t"\
01254 "movq %%mm0, (%1) \n\t"\
01255 "movq %%mm1, 17*8(%1) \n\t"\
01256 "movq %%mm2, 2*17*8(%1) \n\t"\
01257 "movq %%mm3, 3*17*8(%1) \n\t"\
01258 "add $8, %1 \n\t"\
01259 "add %3, %0 \n\t"\
01260 "decl %2 \n\t"\
01261 " jnz 1b \n\t"\
01262 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01263 : "r" ((x86_reg)srcStride)\
01264 : "memory"\
01265 );\
01266 \
01267 temp_ptr= temp;\
01268 count=4;\
01269 \
01270 \
/* Pass 2: four iterations, one per 4-pixel column (136 = 17*8 bytes of */\
/* temp each). Sixteen QPEL_V_LOW expansions write rows alternately to */\
/* (%1) and (%1, %3); dst moves down by %4 = 2*dstStride seven times and */\
/* %6 = 4-14*dstStride then returns it to the top row, 4 bytes further right. */\
/* The last three expansions take their in7 operand from 128, 120 and */\
/* 112(%0), i.e. rows 16, 15 and 14, instead of reading beyond row 16. */\
01271 __asm__ volatile(\
01272 \
01273 "1: \n\t"\
01274 "movq (%0), %%mm0 \n\t"\
01275 "movq 8(%0), %%mm1 \n\t"\
01276 "movq 16(%0), %%mm2 \n\t"\
01277 "movq 24(%0), %%mm3 \n\t"\
01278 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01279 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01280 "add %4, %1 \n\t"\
01281 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01282 \
01283 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01284 "add %4, %1 \n\t"\
01285 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01286 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
01287 "add %4, %1 \n\t"\
01288 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
01289 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
01290 "add %4, %1 \n\t"\
01291 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
01292 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
01293 "add %4, %1 \n\t"\
01294 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
01295 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
01296 "add %4, %1 \n\t"\
01297 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
01298 \
01299 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
01300 "add %4, %1 \n\t" \
01301 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
01302 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
01303 \
01304 "add $136, %0 \n\t"\
01305 "add %6, %1 \n\t"\
01306 "decl %2 \n\t"\
01307 " jnz 1b \n\t"\
01308 \
/* operands: %0=temp_ptr %1=dst %2=count %3=dstStride %4=2*dstStride */\
/*           %5=ROUNDER %6=4-14*dstStride */\
01309 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
01310 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
01311 :"memory"\
01312 );\
01313 }\
01314 \
/* Vertical filter, 8 pixels wide, 8 output rows, 9 input rows. */\
01315 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01316 uint64_t temp[9*2];\
01317 uint64_t *temp_ptr= temp;\
01318 int count= 9;\
01319 \
01320 \
/* Pass 1: widen 9 rows of 8 source bytes to words; the low four pixels */\
/* of row r go to temp[r], the high four to temp[9+r]. */\
/* operands: %0=src %1=temp_ptr %2=count %3=srcStride */\
01321 __asm__ volatile(\
01322 "pxor %%mm7, %%mm7 \n\t"\
01323 "1: \n\t"\
01324 "movq (%0), %%mm0 \n\t"\
01325 "movq (%0), %%mm1 \n\t"\
01326 "punpcklbw %%mm7, %%mm0 \n\t"\
01327 "punpckhbw %%mm7, %%mm1 \n\t"\
01328 "movq %%mm0, (%1) \n\t"\
01329 "movq %%mm1, 9*8(%1) \n\t"\
01330 "add $8, %1 \n\t"\
01331 "add %3, %0 \n\t"\
01332 "decl %2 \n\t"\
01333 " jnz 1b \n\t"\
01334 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01335 : "r" ((x86_reg)srcStride)\
01336 : "memory"\
01337 );\
01338 \
01339 temp_ptr= temp;\
01340 count=2;\
01341 \
01342 \
/* Pass 2: two iterations, one per 4-pixel column (72 = 9*8 bytes of temp */\
/* each). Eight QPEL_V_LOW expansions write the rows; dst moves down by */\
/* %4 = 2*dstStride three times and %6 = 4-6*dstStride then returns it to */\
/* the top row, 4 bytes further right. */\
01343 __asm__ volatile(\
01344 \
01345 "1: \n\t"\
01346 "movq (%0), %%mm0 \n\t"\
01347 "movq 8(%0), %%mm1 \n\t"\
01348 "movq 16(%0), %%mm2 \n\t"\
01349 "movq 24(%0), %%mm3 \n\t"\
01350 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01351 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01352 "add %4, %1 \n\t"\
01353 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01354 \
01355 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01356 "add %4, %1 \n\t"\
01357 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01358 \
01359 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
01360 "add %4, %1 \n\t"\
01361 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
01362 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
01363 \
01364 "add $72, %0 \n\t"\
01365 "add %6, %1 \n\t"\
01366 "decl %2 \n\t"\
01367 " jnz 1b \n\t"\
01368 \
/* operands: %0=temp_ptr %1=dst %2=count %3=dstStride %4=2*dstStride */\
/*           %5=ROUNDER %6=4-6*dstStride */\
01369 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
01370 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
01371 : "memory"\
01372 );\
01373 }\
01374 \
/* ---- 8x8 entry points ---- */\
/* mc00: forwards to OPNAME pixels8 with a height of 8. */\
01375 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01376 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
01377 }\
01378 \
/* mc10: horizontal filter into a 64-byte scratch block (row pitch 8), */\
/* then pixels8_l2 combines src and the scratch block into dst. */\
01379 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01380 uint64_t temp[8];\
01381 uint8_t * const half= (uint8_t*)temp;\
01382 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
01383 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
01384 }\
01385 \
/* mc20: horizontal filter straight into dst. */\
01386 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01387 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
01388 }\
01389 \
/* mc30: as mc10, but combined with src+1 instead of src. */\
01390 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01391 uint64_t temp[8];\
01392 uint8_t * const half= (uint8_t*)temp;\
01393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
01394 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
01395 }\
01396 \
/* mc01: vertical filter into scratch, then combined with src. */\
01397 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01398 uint64_t temp[8];\
01399 uint8_t * const half= (uint8_t*)temp;\
01400 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
01401 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
01402 }\
01403 \
/* mc02: vertical filter straight into dst. */\
01404 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01405 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
01406 }\
01407 \
/* mc03: as mc01, but combined with src+stride (the row below). */\
01408 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01409 uint64_t temp[8];\
01410 uint8_t * const half= (uint8_t*)temp;\
01411 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
01412 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
01413 }\
/* mc11/mc31/mc13/mc33/mc21/mc23 share one scratch layout: half[8+9] is */\
/* 136 bytes, halfHV = the first 64 (8 rows of 8) and halfH = the 72 bytes */\
/* after it (9 rows of 8). halfH receives the horizontal pass over 9 rows */\
/* (for mc11/31/13/33 additionally combined with src or src+1), halfHV the */\
/* vertical pass over halfH. The result combines halfHV with halfH for the */\
/* mcX1 functions and with halfH+8, one row lower, for the mcX3 functions. */\
01414 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01415 uint64_t half[8 + 9];\
01416 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01417 uint8_t * const halfHV= ((uint8_t*)half);\
01418 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01419 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01420 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01421 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01422 }\
01423 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01424 uint64_t half[8 + 9];\
01425 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01426 uint8_t * const halfHV= ((uint8_t*)half);\
01427 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01428 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01429 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01430 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01431 }\
01432 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01433 uint64_t half[8 + 9];\
01434 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01435 uint8_t * const halfHV= ((uint8_t*)half);\
01436 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01437 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01438 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01439 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01440 }\
01441 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01442 uint64_t half[8 + 9];\
01443 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01444 uint8_t * const halfHV= ((uint8_t*)half);\
01445 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01446 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01447 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01448 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01449 }\
01450 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01451 uint64_t half[8 + 9];\
01452 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01453 uint8_t * const halfHV= ((uint8_t*)half);\
01454 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01455 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01456 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01457 }\
01458 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01459 uint64_t half[8 + 9];\
01460 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01461 uint8_t * const halfHV= ((uint8_t*)half);\
01462 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01463 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01464 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01465 }\
/* mc12/mc32/mc22: halfH sits at the start of the scratch array; the */\
/* vertical pass reads it with a row pitch of 8 and writes dst directly. */\
01466 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01467 uint64_t half[8 + 9];\
01468 uint8_t * const halfH= ((uint8_t*)half);\
01469 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01470 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01471 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01472 }\
01473 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01474 uint64_t half[8 + 9];\
01475 uint8_t * const halfH= ((uint8_t*)half);\
01476 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01477 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01478 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01479 }\
01480 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01481 uint64_t half[9];\
01482 uint8_t * const halfH= ((uint8_t*)half);\
01483 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01484 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01485 }\
/* ---- 16x16 entry points: same structure as the 8x8 ones, with a row */\
/* pitch of 16 and 16 or 17 rows ---- */\
01486 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01487 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
01488 }\
01489 \
/* temp[32] is a 256-byte scratch block: 16 rows of 16 bytes. */\
01490 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01491 uint64_t temp[32];\
01492 uint8_t * const half= (uint8_t*)temp;\
01493 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
01494 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
01495 }\
01496 \
01497 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01498 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
01499 }\
01500 \
01501 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01502 uint64_t temp[32];\
01503 uint8_t * const half= (uint8_t*)temp;\
01504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
01505 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
01506 }\
01507 \
01508 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01509 uint64_t temp[32];\
01510 uint8_t * const half= (uint8_t*)temp;\
01511 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
01512 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
01513 }\
01514 \
01515 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01516 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
01517 }\
01518 \
01519 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01520 uint64_t temp[32];\
01521 uint8_t * const half= (uint8_t*)temp;\
01522 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
01523 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
01524 }\
/* Scratch layout for mc11/31/13/33/21/23: half[16*2 + 17*2] is 528 bytes, */\
/* halfHV = the first 256 (16 rows of 16) and halfH = the 272 bytes after */\
/* it (17 rows of 16). halfH+16 is halfH shifted down by one row. */\
01525 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01526 uint64_t half[16*2 + 17*2];\
01527 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01528 uint8_t * const halfHV= ((uint8_t*)half);\
01529 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01530 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01531 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01532 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01533 }\
01534 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01535 uint64_t half[16*2 + 17*2];\
01536 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01537 uint8_t * const halfHV= ((uint8_t*)half);\
01538 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01539 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01540 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01541 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01542 }\
01543 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01544 uint64_t half[16*2 + 17*2];\
01545 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01546 uint8_t * const halfHV= ((uint8_t*)half);\
01547 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01548 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01549 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01550 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01551 }\
01552 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01553 uint64_t half[16*2 + 17*2];\
01554 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01555 uint8_t * const halfHV= ((uint8_t*)half);\
01556 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01557 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01558 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01559 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01560 }\
01561 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01562 uint64_t half[16*2 + 17*2];\
01563 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01564 uint8_t * const halfHV= ((uint8_t*)half);\
01565 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01566 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01567 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01568 }\
01569 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01570 uint64_t half[16*2 + 17*2];\
01571 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01572 uint8_t * const halfHV= ((uint8_t*)half);\
01573 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01574 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01575 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01576 }\
/* mc12/mc32/mc22: half[17*2] is 272 bytes, exactly the 17 rows of 16 that */\
/* the horizontal pass produces; the vertical pass writes dst directly. */\
01577 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01578 uint64_t half[17*2];\
01579 uint8_t * const halfH= ((uint8_t*)half);\
01580 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01581 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01582 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01583 }\
01584 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01585 uint64_t half[17*2];\
01586 uint8_t * const halfH= ((uint8_t*)half);\
01587 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01588 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01589 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01590 }\
01591 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01592 uint64_t half[17*2];\
01593 uint8_t * const halfH= ((uint8_t*)half);\
01594 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01595 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01596 }
01597
/*
 * Plain store for use inside an asm template: expands to the string
 * "mov<sz> <val>, <dst>". The third argument is accepted but not used.
 */
#define PUT_OP(val, dst, scratch, sz) \
    "mov" #sz " " #val ", " #dst " \n\t"
/*
 * Read-modify-write store for use inside an asm template: loads <dst>
 * into <scratch>, applies pavgusb <scratch>, <val>, then writes <val>
 * back to <dst>.
 */
#define AVG_3DNOW_OP(val, dst, scratch, sz)       \
    "mov" #sz " " #dst ", " #scratch " \n\t"      \
    "pavgusb " #scratch ", " #val " \n\t"         \
    "mov" #sz " " #val ", " #dst " \n\t"
/*
 * Read-modify-write store for use inside an asm template: loads <dst>
 * into <scratch>, applies pavgb <scratch>, <val>, then writes <val>
 * back to <dst>.
 */
#define AVG_MMX2_OP(val, dst, scratch, sz)        \
    "mov" #sz " " #dst ", " #scratch " \n\t"      \
    "pavgb " #scratch ", " #val " \n\t"           \
    "mov" #sz " " #val ", " #dst " \n\t"
01607
/*
 * Instantiate the qpel filters.
 * QPEL_BASE creates the h_lowpass helpers (both _mmx2 and _3dnow) for the
 * put_, avg_ and put_no_rnd_ families, with ff_pw_16, ff_pw_16 and ff_pw_15
 * as rounding constant. QPEL_OP then creates the v_lowpass helpers and the
 * mcXY entry points for the same three families, once with the 3dnow
 * suffix and once with the mmx2 suffix. The QPEL_BASE lines come first
 * because the QPEL_OP bodies call the h_lowpass helpers by name.
 */
01608 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
01609 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
01610 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
01611 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
01612 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
01613 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
01614 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
01615 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
01616 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
01617
01618
01619
01620
/*
 * QPEL_2TAP_XY - define OPNAME 2tap_qpel SIZE _mc XY _ MMX (dst, src, stride)
 * as a direct call to OPNAME pixels SIZE HPEL (dst, src, stride, SIZE).
 * HPEL is the complete suffix of the callee, including its CPU tag.
 */
01621 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
01622 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01623 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
01624 }
/*
 * QPEL_2TAP_L3 - define OPNAME 2tap_qpel SIZE _mc XY _ MMX (dst, src, stride)
 * as a call to OPNAME 2tap_qpel SIZE _l3_ MMX (dst, src+S0, stride, SIZE, S1, S2).
 * S0, S1 and S2 are pasted into the generated body unevaluated, so the
 * caller of this macro may write them in terms of the generated function's
 * stride parameter.
 */
01625 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
01626 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01627 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
01628 }
01629
/*
 * QPEL_2TAP - generate the full set of 2tap_qpel SIZE _mcXY_ MMX symbols for
 * one OPNAME / SIZE / MMX combination:
 *   mc20, mc02, mc22     wrappers around pixels SIZE _x2 / _y2 / _xy2
 *                        (mc22 always uses the _xy2_mmx routine, whatever
 *                        MMX is)
 *   mc00, mc21, mc12     constant function pointers aliasing the regular
 *                        qpel mc00 and the 2-tap mc20 / mc02 defined here
 *   mc32, mc23           _y2 applied at src+1, _x2 applied at src+stride
 *   mc10 .. mc33 (rest)  QPEL_2TAP_L3 wrappers differing only in the
 *                        (S0, S1, S2) offsets handed to the _l3_ routine
 * The macro body ends with a line continuation, so the line following it
 * is still part of the definition.
 */
01630 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
01631 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
01632 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
01633 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
01634 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
01635 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
01636 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
01637 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
01638 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
01639 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
01640 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01641 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
01642 }\
01643 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01644 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
01645 }\
01646 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
01647 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
01648 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
01649 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
01650 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
01651 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
01652 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
01653 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
01654
/*
 * Instantiate the 2-tap qpel functions for the put_ and avg_ families,
 * block sizes 16 and 8, first with the mmx2 suffix and then with 3dnow.
 */
01655 QPEL_2TAP(put_, 16, mmx2)
01656 QPEL_2TAP(avg_, 16, mmx2)
01657 QPEL_2TAP(put_, 8, mmx2)
01658 QPEL_2TAP(avg_, 8, mmx2)
01659 QPEL_2TAP(put_, 16, 3dnow)
01660 QPEL_2TAP(avg_, 16, 3dnow)
01661 QPEL_2TAP(put_, 8, 3dnow)
01662 QPEL_2TAP(avg_, 8, 3dnow)
01663
01664
/* Disabled: the #if 0 guard keeps this do-nothing helper out of the build. */
01665 #if 0
01666 static void just_return(void) { return; }
01667 #endif
01668
01669 #if HAVE_YASM
/*
 * Prototype shared by the edge-emulation core routines, which are only
 * declared here (extern) and defined elsewhere. emulated_edge_mc() below
 * calls them as
 *   core_fn(buf, src, linesize, start_y, end_y, block_h,
 *           start_x, end_x, block_w)
 * with every size and position argument passed as x86_reg.
 */
01670 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
01671 x86_reg linesize, x86_reg start_y,
01672 x86_reg end_y, x86_reg block_h,
01673 x86_reg start_x, x86_reg end_x,
01674 x86_reg block_w);
01675 extern emu_edge_core_func ff_emu_edge_core_mmx;
01676 extern emu_edge_core_func ff_emu_edge_core_sse;
01677
01678 static av_always_inline
01679 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
01680 int block_w, int block_h,
01681 int src_x, int src_y, int w, int h,
01682 emu_edge_core_func *core_fn)
01683 {
01684 int start_y, start_x, end_y, end_x, src_y_add=0;
01685
01686 if(src_y>= h){
01687 src_y_add = h-1-src_y;
01688 src_y=h-1;
01689 }else if(src_y<=-block_h){
01690 src_y_add = 1-block_h-src_y;
01691 src_y=1-block_h;
01692 }
01693 if(src_x>= w){
01694 src+= (w-1-src_x);
01695 src_x=w-1;
01696 }else if(src_x<=-block_w){
01697 src+= (1-block_w-src_x);
01698 src_x=1-block_w;
01699 }
01700
01701 start_y= FFMAX(0, -src_y);
01702 start_x= FFMAX(0, -src_x);
01703 end_y= FFMIN(block_h, h-src_y);
01704 end_x= FFMIN(block_w, w-src_x);
01705 assert(start_x < end_x && block_w > 0);
01706 assert(start_y < end_y && block_h > 0);
01707
01708
01709 src += (src_y_add+start_y)*linesize + start_x;
01710 buf += start_x;
01711 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
01712 }
01713
#if ARCH_X86_32
/**
 * Edge emulation backed by ff_emu_edge_core_mmx; only built when
 * ARCH_X86_32 is set. Declared av_noinline, so the always-inline
 * emulated_edge_mc() body is expanded here rather than in callers.
 * Parameters are those of emulated_edge_mc() without the core routine.
 */
static av_noinline
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emu_edge_core_func *const core = &ff_emu_edge_core_mmx;

    emulated_edge_mc(buf, src, linesize,
                     block_w, block_h,
                     src_x, src_y, w, h,
                     core);
}
#endif
01724 static av_noinline
01725 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
01726 int block_w, int block_h,
01727 int src_x, int src_y, int w, int h)
01728 {
01729 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01730 w, h, &ff_emu_edge_core_sse);
01731 }
01732 #endif
01733
/*
 * Prototype of an edge-emulation entry point taking plain int arguments;
 * emulated_edge_mc_mmx() and emulated_edge_mc_sse() above have this
 * signature, and gmc() below receives a pointer of this type.
 */
01734 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
01735 int linesize, int block_w, int block_h,
01736 int src_x, int src_y, int w, int h);
01737
/**
 * Global motion compensation of an 8-pixel-wide, h-line block using MMX
 * bilinear interpolation, with a fallback to ff_gmc_c().
 *
 * @param dst         destination; pixel (x,y) is written to dst[x+y*stride]
 * @param src         source picture pointer; it is advanced by ix+iy*stride
 *                    (the integer part of ox/oy) before sampling
 * @param stride      line size used for both dst and src
 * @param h           number of lines to produce
 * @param ox,oy       source position in fixed point; the integer part is
 *                    ox>>(16+shift) resp. oy>>(16+shift)
 * @param dxx,dyx     increments of the x resp. y source coordinate per column
 * @param dxy,dyy     increments of the x resp. y source coordinate per line
 * @param shift       see ox/oy; the blended sum is shifted right by 2*shift
 * @param r           constant added to the blended sum before that shift
 * @param width,height bounds used to decide whether edge emulation is needed;
 *                    also forwarded to emu_edge_fn and ff_gmc_c()
 * @param emu_edge_fn edge-emulation routine used to build a padded copy of
 *                    the (w+1)x(h+1) source area in a local buffer
 */
01738 static av_always_inline
01739 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01740 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
01741 emulated_edge_mc_func *emu_edge_fn)
01742 {
01743 const int w = 8;
/* integer part of the start position */
01744 const int ix = ox>>(16+shift);
01745 const int iy = oy>>(16+shift);
/* position and deltas with their low 4 bits dropped; only the low 16 bits
 * of these survive when they are stored into the uint16_t vectors below */
01746 const int oxs = ox>>4;
01747 const int oys = oy>>4;
01748 const int dxxs = dxx>>4;
01749 const int dxys = dxy>>4;
01750 const int dyxs = dyx>>4;
01751 const int dyys = dyy>>4;
/* four-word vectors used as 64-bit memory operands by the asm below */
01752 const uint16_t r4[4] = {r,r,r,r};
01753 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
01754 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
01755 const uint64_t shift2 = 2*shift;
/* variable-length scratch buffer for the edge-emulated source copy */
01756 uint8_t edge_buf[(h+1)*stride];
01757 int x, y;
01758
/* coordinate drift across the block width (w-1 columns) and height (h-1
 * lines), measured relative to a plain one-pixel-per-pixel step */
01759 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
01760 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
01761 const int dxh = dxy*(h-1);
01762 const int dyw = dyx*(w-1);
/* Use the C version when either
 *  - adding any of the drifts changes a bit of ox/oy at or above bit
 *    16+shift, i.e. the integer offset is not constant over the block, or
 *  - any delta has non-zero low 4 bits (those are lost by the >>4 above). */
01763 if(
01764 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
01765 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
01766
01767 || (dxx|dxy|dyx|dyy)&15 )
01768 {
01769
01770 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
01771 return;
01772 }
01773
01774 src += ix + iy*stride;
/* the unsigned casts make negative ix/iy fail the comparison as well */
01775 if( (unsigned)ix >= width-w ||
01776 (unsigned)iy >= height-h )
01777 {
01778 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
01779 src = edge_buf;
01780 }
01781
/* mm6 = (1<<shift) replicated into four words, mm7 = 0; both registers are
 * relied upon by the asm statements inside the loops below */
01782 __asm__ volatile(
01783 "movd %0, %%mm6 \n\t"
01784 "pxor %%mm7, %%mm7 \n\t"
01785 "punpcklwd %%mm6, %%mm6 \n\t"
01786 "punpcklwd %%mm6, %%mm6 \n\t"
01787 :: "r"(1<<shift)
01788 );
01789
/* the block is processed as two strips of 4 columns */
01790 for(x=0; x<w; x+=4){
/* per-column accumulators, started one line-step early because the line
 * delta is added before each line is sampled */
01791 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
01792 oxs - dxys + dxxs*(x+1),
01793 oxs - dxys + dxxs*(x+2),
01794 oxs - dxys + dxxs*(x+3) };
01795 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
01796 oys - dyys + dyxs*(x+1),
01797 oys - dyys + dyxs*(x+2),
01798 oys - dyys + dyxs*(x+3) };
01799
01800 for(y=0; y<h; y++){
/* advance the accumulators by one line, store them back, and keep the top
 * 4 bits of each word in mm4 (x weight) and mm5 (y weight) */
01801 __asm__ volatile(
01802 "movq %0, %%mm4 \n\t"
01803 "movq %1, %%mm5 \n\t"
01804 "paddw %2, %%mm4 \n\t"
01805 "paddw %3, %%mm5 \n\t"
01806 "movq %%mm4, %0 \n\t"
01807 "movq %%mm5, %1 \n\t"
01808 "psrlw $12, %%mm4 \n\t"
01809 "psrlw $12, %%mm5 \n\t"
01810 : "+m"(*dx4), "+m"(*dy4)
01811 : "m"(*dxy4), "m"(*dyy4)
01812 );
01813
/* Bilinear blend of four pixels, with s = mm6, fx = mm4, fy = mm5:
 *   (s-fx)*(s-fy)*src[0]      + fx*(s-fy)*src[1]
 * + (s-fx)*fy    *src[stride] + fx*fy    *src[stride+1] + r
 * shifted right by shift2, packed to unsigned bytes with saturation and
 * stored as 4 bytes at dst[x+y*stride]. */
01814 __asm__ volatile(
01815 "movq %%mm6, %%mm2 \n\t"
01816 "movq %%mm6, %%mm1 \n\t"
01817 "psubw %%mm4, %%mm2 \n\t"
01818 "psubw %%mm5, %%mm1 \n\t"
01819 "movq %%mm2, %%mm0 \n\t"
01820 "movq %%mm4, %%mm3 \n\t"
01821 "pmullw %%mm1, %%mm0 \n\t"
01822 "pmullw %%mm5, %%mm3 \n\t"
01823 "pmullw %%mm5, %%mm2 \n\t"
01824 "pmullw %%mm4, %%mm1 \n\t"
01825
01826 "movd %4, %%mm5 \n\t"
01827 "movd %3, %%mm4 \n\t"
01828 "punpcklbw %%mm7, %%mm5 \n\t"
01829 "punpcklbw %%mm7, %%mm4 \n\t"
01830 "pmullw %%mm5, %%mm3 \n\t"
01831 "pmullw %%mm4, %%mm2 \n\t"
01832
01833 "movd %2, %%mm5 \n\t"
01834 "movd %1, %%mm4 \n\t"
01835 "punpcklbw %%mm7, %%mm5 \n\t"
01836 "punpcklbw %%mm7, %%mm4 \n\t"
01837 "pmullw %%mm5, %%mm1 \n\t"
01838 "pmullw %%mm4, %%mm0 \n\t"
01839 "paddw %5, %%mm1 \n\t"
01840 "paddw %%mm3, %%mm2 \n\t"
01841 "paddw %%mm1, %%mm0 \n\t"
01842 "paddw %%mm2, %%mm0 \n\t"
01843
01844 "psrlw %6, %%mm0 \n\t"
01845 "packuswb %%mm0, %%mm0 \n\t"
01846 "movd %%mm0, %0 \n\t"
01847
01848 : "=m"(dst[x+y*stride])
01849 : "m"(src[0]), "m"(src[1]),
01850 "m"(src[stride]), "m"(src[stride+1]),
01851 "m"(*r4), "m"(shift2)
01852 );
01853 src += stride;
01854 }
/* back to the first line, 4 columns further right */
01855 src += 4-h*stride;
01856 }
01857 }
01858
01859 #if HAVE_YASM
01860 #if ARCH_X86_32
01861 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01862 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
01863 {
01864 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01865 width, height, &emulated_edge_mc_mmx);
01866 }
01867 #endif
01868 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01869 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
01870 {
01871 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01872 width, height, &emulated_edge_mc_sse);
01873 }
01874 #else
01875 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01876 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
01877 {
01878 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01879 width, height, &ff_emulated_edge_mc);
01880 }
01881 #endif
01882
/**
 * Define a function `name(mem, stride, h)` that issues the prefetch
 * instruction `op` on the first byte of h consecutive lines, starting at
 * mem and stepping by stride bytes.
 *
 * The loop tests its counter after the body, so the first line is always
 * prefetched and the count is decremented before it is checked.
 */
#define PREFETCH(name, op)                                  \
static void name(void *mem, int stride, int h)              \
{                                                           \
    const uint8_t *line = mem;                              \
    do {                                                    \
        __asm__ volatile(#op" %0" :: "m"(*line));           \
        line += stride;                                     \
    } while (--h);                                          \
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
01894
01895 #include "h264_qpel_mmx.c"
01896
/*
 * Prototypes of chroma motion-compensation routines (H.264 and RV40, block
 * widths 8/4/2, put and avg variants). No bodies are provided in this part
 * of the file; dsputil_init_mmx() installs them into the DSPContext chroma
 * tables only inside #if HAVE_YASM sections.
 * All share the signature (dst, src, stride, h, x, y).
 */
01897 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
01898 int stride, int h, int x, int y);
01899 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
01900 int stride, int h, int x, int y);
01901 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
01902 int stride, int h, int x, int y);
01903 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
01904 int stride, int h, int x, int y);
01905 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
01906 int stride, int h, int x, int y);
01907 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
01908 int stride, int h, int x, int y);
01909
/* width-4 variants */
01910 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
01911 int stride, int h, int x, int y);
01912 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
01913 int stride, int h, int x, int y);
01914 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
01915 int stride, int h, int x, int y);
01916 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
01917 int stride, int h, int x, int y);
01918 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
01919 int stride, int h, int x, int y);
01920 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
01921 int stride, int h, int x, int y);
01922
/* width-2 variants (H.264 only) */
01923 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
01924 int stride, int h, int x, int y);
01925 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
01926 int stride, int h, int x, int y);
01927
/* SSSE3 variants */
01928 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
01929 int stride, int h, int x, int y);
01930 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
01931 int stride, int h, int x, int y);
01932
01933 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
01934 int stride, int h, int x, int y);
01935 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
01936 int stride, int h, int x, int y);
01937
01938
01939
/**
 * CAVS qpel 8x8 "put" at position mc00: forwards to put_pixels8_mmx()
 * with a height of 8 lines.
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_mmx(dst, src, stride, rows);
}
/**
 * CAVS qpel 8x8 "avg" at position mc00: forwards to avg_pixels8_mmx()
 * with a height of 8 lines.
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_mmx(dst, src, stride, rows);
}
/**
 * CAVS qpel 16x16 "put" at position mc00: forwards to put_pixels16_mmx()
 * with a height of 16 lines.
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_mmx(dst, src, stride, rows);
}
/**
 * CAVS qpel 16x16 "avg" at position mc00: forwards to avg_pixels16_mmx()
 * with a height of 16 lines.
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_mmx(dst, src, stride, rows);
}
01952
01953
/**
 * VC-1 mspel "put" at position mc00: forwards to put_pixels8_mmx() with a
 * height of 8 lines. The rnd argument is not used.
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    const int rows = 8;

    put_pixels8_mmx(dst, src, stride, rows);
}
/**
 * VC-1 mspel "avg" at position mc00: forwards to avg_pixels8_mmx2() with a
 * height of 8 lines. The rnd argument is not used.
 */
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    const int rows = 8;

    avg_pixels8_mmx2(dst, src, stride, rows);
}
01960
01961
01962
01963 #if CONFIG_GPL
01964 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
01965 {
01966 ff_mmx_idct (block);
01967 ff_put_pixels_clamped_mmx(block, dest, line_size);
01968 }
01969 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
01970 {
01971 ff_mmx_idct (block);
01972 ff_add_pixels_clamped_mmx(block, dest, line_size);
01973 }
01974 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
01975 {
01976 ff_mmxext_idct (block);
01977 ff_put_pixels_clamped_mmx(block, dest, line_size);
01978 }
01979 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
01980 {
01981 ff_mmxext_idct (block);
01982 ff_add_pixels_clamped_mmx(block, dest, line_size);
01983 }
01984 #endif
01985 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
01986 {
01987 ff_idct_xvid_mmx (block);
01988 ff_put_pixels_clamped_mmx(block, dest, line_size);
01989 }
01990 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
01991 {
01992 ff_idct_xvid_mmx (block);
01993 ff_add_pixels_clamped_mmx(block, dest, line_size);
01994 }
01995 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
01996 {
01997 ff_idct_xvid_mmx2 (block);
01998 ff_put_pixels_clamped_mmx(block, dest, line_size);
01999 }
02000 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
02001 {
02002 ff_idct_xvid_mmx2 (block);
02003 ff_add_pixels_clamped_mmx(block, dest, line_size);
02004 }
02005
/**
 * In-place Vorbis inverse channel coupling using 3DNow!, two floats per
 * iteration.
 *
 * With t = (mag >= 0 ? -ang : ang), computed per element:
 *   if ang >= 0:  ang' = mag + t,  mag' = mag
 *   else:         ang' = mag,      mag' = mag - t
 *
 * @param mag       magnitude vector, updated in place
 * @param ang       angle vector, updated in place
 * @param blocksize number of floats; elements i and i+1 are touched for
 *                  every even i < blocksize
 */
02006 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
02007 {
02008 int i;
/* mm7 = 0, the comparison reference used inside the loop */
02009 __asm__ volatile("pxor %%mm7, %%mm7":);
02010 for(i=0; i<blocksize; i+=2) {
/* mm2 = mask(mag >= 0) reduced to the sign bit, mm3 = mask(ang >= 0);
 * mm1 = ang with its sign flipped where mag >= 0 (this is t);
 * mm3 = t where ang >= 0 else 0, mm4 = t where ang < 0 else 0 */
02011 __asm__ volatile(
02012 "movq %0, %%mm0 \n\t"
02013 "movq %1, %%mm1 \n\t"
02014 "movq %%mm0, %%mm2 \n\t"
02015 "movq %%mm1, %%mm3 \n\t"
02016 "pfcmpge %%mm7, %%mm2 \n\t"
02017 "pfcmpge %%mm7, %%mm3 \n\t"
02018 "pslld $31, %%mm2 \n\t"
02019 "pxor %%mm2, %%mm1 \n\t"
02020 "movq %%mm3, %%mm4 \n\t"
02021 "pand %%mm1, %%mm3 \n\t"
02022 "pandn %%mm1, %%mm4 \n\t"
02023 "pfadd %%mm0, %%mm3 \n\t"
02024 "pfsub %%mm4, %%mm0 \n\t"
02025 "movq %%mm3, %1 \n\t"
02026 "movq %%mm0, %0 \n\t"
02027 :"+m"(mag[i]), "+m"(ang[i])
02028 ::"memory"
02029 );
02030 }
/* leave MMX/3DNow! state */
02031 __asm__ volatile("femms");
02032 }
/**
 * In-place Vorbis inverse channel coupling using SSE, four floats per
 * iteration.
 *
 * With t = (mag >= 0 ? -ang : ang), computed per element:
 *   if ang >= 0:  ang' = mag + t,  mag' = mag
 *   else:         ang' = mag,      mag' = mag - t
 *
 * mag and ang are accessed with movaps, i.e. as aligned 16-byte vectors.
 *
 * @param mag       magnitude vector, updated in place
 * @param ang       angle vector, updated in place
 * @param blocksize number of floats; processed in groups of four
 */
02033 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02034 {
02035 int i;
02036
/* xmm5 = sign-bit mask (ff_pdw_80000000); the loop body below relies on
 * xmm5 still holding it */
02037 __asm__ volatile(
02038 "movaps %0, %%xmm5 \n\t"
02039 ::"m"(ff_pdw_80000000[0])
02040 );
02041 for(i=0; i<blocksize; i+=4) {
/* xmm2 = mask(0 <= mag) reduced to the sign bit, xmm3 = mask(0 <= ang);
 * xmm1 = ang with its sign flipped where mag >= 0 (this is t);
 * xmm3 = t where ang >= 0 else 0, xmm4 = t where ang < 0 else 0 */
02042 __asm__ volatile(
02043 "movaps %0, %%xmm0 \n\t"
02044 "movaps %1, %%xmm1 \n\t"
02045 "xorps %%xmm2, %%xmm2 \n\t"
02046 "xorps %%xmm3, %%xmm3 \n\t"
02047 "cmpleps %%xmm0, %%xmm2 \n\t"
02048 "cmpleps %%xmm1, %%xmm3 \n\t"
02049 "andps %%xmm5, %%xmm2 \n\t"
02050 "xorps %%xmm2, %%xmm1 \n\t"
02051 "movaps %%xmm3, %%xmm4 \n\t"
02052 "andps %%xmm1, %%xmm3 \n\t"
02053 "andnps %%xmm1, %%xmm4 \n\t"
02054 "addps %%xmm0, %%xmm3 \n\t"
02055 "subps %%xmm4, %%xmm0 \n\t"
02056 "movaps %%xmm3, %1 \n\t"
02057 "movaps %%xmm0, %0 \n\t"
02058 :"+m"(mag[i]), "+m"(ang[i])
02059 ::"memory"
02060 );
02061 }
02062 }
02063
/* Selector macros: IF1(x) expands to x, IF0(x) expands to nothing. They are
 * passed to MIX5/MIX_MISC to switch individual asm lines on or off. */
02064 #define IF1(x) x
02065 #define IF0(x)
02066
/*
 * MIX5(mono, stereo): downmix of 5 input channels, expanded inside
 * ac3_downmix_sse(). Expects in scope: i (negative byte offset that counts
 * up to 0 in steps of 16), samples, len and matrix.
 *
 * Three coefficients are read from matrix at byte offsets 0, 8 and 24 and
 * broadcast into xmm5, xmm6, xmm7. Channels are addressed 0x400 bytes
 * apart, starting at samples[0]+len. Channels 0 and 2 are scaled by xmm5,
 * channel 1 by xmm6, channels 3 and 4 by xmm7.
 *   stereo: out0 = ch0+ch1+ch3, out1 = ch2+ch1+ch4 (stored to channels 0, 1)
 *   mono:   out0 = ch0+ch1+ch2+ch3+ch4             (stored to channel 0)
 * where chN denotes the already scaled channel.
 */
02067 #define MIX5(mono,stereo)\
02068 __asm__ volatile(\
02069 "movss 0(%2), %%xmm5 \n"\
02070 "movss 8(%2), %%xmm6 \n"\
02071 "movss 24(%2), %%xmm7 \n"\
02072 "shufps $0, %%xmm5, %%xmm5 \n"\
02073 "shufps $0, %%xmm6, %%xmm6 \n"\
02074 "shufps $0, %%xmm7, %%xmm7 \n"\
02075 "1: \n"\
02076 "movaps (%0,%1), %%xmm0 \n"\
02077 "movaps 0x400(%0,%1), %%xmm1 \n"\
02078 "movaps 0x800(%0,%1), %%xmm2 \n"\
02079 "movaps 0xc00(%0,%1), %%xmm3 \n"\
02080 "movaps 0x1000(%0,%1), %%xmm4 \n"\
02081 "mulps %%xmm5, %%xmm0 \n"\
02082 "mulps %%xmm6, %%xmm1 \n"\
02083 "mulps %%xmm5, %%xmm2 \n"\
02084 "mulps %%xmm7, %%xmm3 \n"\
02085 "mulps %%xmm7, %%xmm4 \n"\
02086 stereo("addps %%xmm1, %%xmm0 \n")\
02087 "addps %%xmm1, %%xmm2 \n"\
02088 "addps %%xmm3, %%xmm0 \n"\
02089 "addps %%xmm4, %%xmm2 \n"\
02090 mono("addps %%xmm2, %%xmm0 \n")\
02091 "movaps %%xmm0, (%0,%1) \n"\
02092 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
02093 "add $16, %0 \n"\
02094 "jl 1b \n"\
02095 :"+&r"(i)\
02096 :"r"(samples[0]+len), "r"(matrix)\
02097 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
02098 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
02099 "memory"\
02100 );
02101
/*
 * MIX_MISC(stereo): generic downmix, expanded inside ac3_downmix_sse().
 * Expects in scope: i, j, k, samples, len, matrix_simd and in_ch.
 *
 * xmm4 and xmm5 are NOT loaded here: they must already hold the broadcast
 * coefficients for channel 0 (in ac3_downmix_sse the preceding setup asm
 * leaves them there after its last iteration).
 *
 * Outer loop (label 1): one 16-byte vector of channel 0 per iteration,
 * scaled by xmm4 (and by xmm5 for the second output when stereo).
 * Inner loop (label 2): walks the remaining channels in 1024-byte steps
 * (%1) while k (%2) runs from -32*(in_ch-1) up to 0 in steps of 32,
 * indexing the broadcast coefficients relative to matrix_simd+in_ch, and
 * accumulates into xmm0 (and xmm1 when stereo).
 * Results are stored back to channel 0 (and channel 1 when stereo).
 */
02102 #define MIX_MISC(stereo)\
02103 __asm__ volatile(\
02104 "1: \n"\
02105 "movaps (%3,%0), %%xmm0 \n"\
02106 stereo("movaps %%xmm0, %%xmm1 \n")\
02107 "mulps %%xmm4, %%xmm0 \n"\
02108 stereo("mulps %%xmm5, %%xmm1 \n")\
02109 "lea 1024(%3,%0), %1 \n"\
02110 "mov %5, %2 \n"\
02111 "2: \n"\
02112 "movaps (%1), %%xmm2 \n"\
02113 stereo("movaps %%xmm2, %%xmm3 \n")\
02114 "mulps (%4,%2), %%xmm2 \n"\
02115 stereo("mulps 16(%4,%2), %%xmm3 \n")\
02116 "addps %%xmm2, %%xmm0 \n"\
02117 stereo("addps %%xmm3, %%xmm1 \n")\
02118 "add $1024, %1 \n"\
02119 "add $32, %2 \n"\
02120 "jl 2b \n"\
02121 "movaps %%xmm0, (%3,%0) \n"\
02122 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
02123 "add $16, %0 \n"\
02124 "jl 1b \n"\
02125 :"+&r"(i), "=&r"(j), "=&r"(k)\
02126 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
02127 :"memory"\
02128 );
02129
/**
 * In-place AC-3 downmix using SSE.
 *
 * @param samples per-channel sample buffers of 256 floats each; results are
 *                written back to channel 0 (and channel 1 for 2 outputs)
 * @param matrix  downmix coefficients, matrix[input channel][output channel]
 * @param out_ch  number of output channels; 2 selects the stereo code
 *                paths, any other value the single-output paths
 * @param in_ch   number of input channels
 * @param len     number of samples per channel; consumed 4 floats at a time
 */
02130 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
02131 {
/* integer view of the coefficients so they can be tested bit-wise */
02132 int (*matrix_cmp)[2] = (int(*)[2])matrix;
02133 intptr_t i,j,k;
02134
/* negative byte offset from samples[0]+len; the asm loops count it up to 0 */
02135 i = -len*sizeof(float);
/* 5 -> 2 fast path: matrix[0][1], [2][0], [3][1], [4][0] have all bits zero,
 * matrix[1][0] == matrix[1][1] and matrix[0][0] == matrix[2][1] (bit-wise) */
02136 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
02137 MIX5(IF0,IF1);
/* 5 -> 1 fast path: channels 0/2 and channels 3/4 share a coefficient */
02138 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
02139 MIX5(IF1,IF0);
02140 } else {
/* generic path: broadcast every coefficient to a 4-float vector first */
02141 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
02142 j = 2*in_ch*sizeof(float);
/* Walks matrix backwards, 8 bytes (one input channel, two coefficients) per
 * iteration, until j reaches 0. The last iteration handles channel 0, so
 * xmm4/xmm5 end up holding its two coefficients, which MIX_MISC uses. */
02143 __asm__ volatile(
02144 "1: \n"
02145 "sub $8, %0 \n"
02146 "movss (%2,%0), %%xmm4 \n"
02147 "movss 4(%2,%0), %%xmm5 \n"
02148 "shufps $0, %%xmm4, %%xmm4 \n"
02149 "shufps $0, %%xmm5, %%xmm5 \n"
02150 "movaps %%xmm4, (%1,%0,4) \n"
02151 "movaps %%xmm5, 16(%1,%0,4) \n"
02152 "jg 1b \n"
02153 :"+&r"(j)
02154 :"r"(matrix_simd), "r"(matrix)
02155 :"memory"
02156 );
02157 if(out_ch == 2) {
02158 MIX_MISC(IF1);
02159 } else {
02160 MIX_MISC(IF0);
02161 }
02162 }
02163 }
02164
/**
 * Element-wise product using 3DNow!: dst[k] = src0[k] * src1[k].
 *
 * Four floats are handled per iteration, walking from the end of the
 * vectors towards the start; the loop body always runs at least once.
 *
 * @param dst  output vector
 * @param src0 first input vector
 * @param src1 second input vector
 * @param len  number of floats; consumed in groups of 4
 */
02165 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
/* byte offset of the last group of 4 floats */
02166 x86_reg i = (len-4)*4;
02167 __asm__ volatile(
02168 "1: \n\t"
02169 "movq (%2,%0), %%mm0 \n\t"
02170 "movq 8(%2,%0), %%mm1 \n\t"
02171 "pfmul (%3,%0), %%mm0 \n\t"
02172 "pfmul 8(%3,%0), %%mm1 \n\t"
02173 "movq %%mm0, (%1,%0) \n\t"
02174 "movq %%mm1, 8(%1,%0) \n\t"
02175 "sub $16, %0 \n\t"
02176 "jge 1b \n\t"
02177 "femms \n\t"
02178 :"+r"(i)
02179 :"r"(dst), "r"(src0), "r"(src1)
02180 :"memory"
02181 );
02182 }
/**
 * Element-wise product using SSE: dst[k] = src0[k] * src1[k].
 *
 * Eight floats are handled per iteration, walking from the end of the
 * vectors towards the start; the loop body always runs at least once.
 * All three vectors are accessed as aligned 16-byte vectors (movaps, and
 * mulps with a memory operand).
 *
 * @param dst  output vector
 * @param src0 first input vector
 * @param src1 second input vector
 * @param len  number of floats; consumed in groups of 8
 */
02183 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
/* byte offset of the last group of 8 floats */
02184 x86_reg i = (len-8)*4;
02185 __asm__ volatile(
02186 "1: \n\t"
02187 "movaps (%2,%0), %%xmm0 \n\t"
02188 "movaps 16(%2,%0), %%xmm1 \n\t"
02189 "mulps (%3,%0), %%xmm0 \n\t"
02190 "mulps 16(%3,%0), %%xmm1 \n\t"
02191 "movaps %%xmm0, (%1,%0) \n\t"
02192 "movaps %%xmm1, 16(%1,%0) \n\t"
02193 "sub $32, %0 \n\t"
02194 "jge 1b \n\t"
02195 :"+r"(i)
02196 :"r"(dst), "r"(src0), "r"(src1)
02197 :"memory"
02198 );
02199 }
02200
02201 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
02202 x86_reg i = len*4-16;
02203 __asm__ volatile(
02204 "1: \n\t"
02205 "pswapd 8(%1), %%mm0 \n\t"
02206 "pswapd (%1), %%mm1 \n\t"
02207 "pfmul (%3,%0), %%mm0 \n\t"
02208 "pfmul 8(%3,%0), %%mm1 \n\t"
02209 "movq %%mm0, (%2,%0) \n\t"
02210 "movq %%mm1, 8(%2,%0) \n\t"
02211 "add $16, %1 \n\t"
02212 "sub $16, %0 \n\t"
02213 "jge 1b \n\t"
02214 :"+r"(i), "+r"(src1)
02215 :"r"(dst), "r"(src0)
02216 );
02217 __asm__ volatile("femms");
02218 }
02219 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
02220 x86_reg i = len*4-32;
02221 __asm__ volatile(
02222 "1: \n\t"
02223 "movaps 16(%1), %%xmm0 \n\t"
02224 "movaps (%1), %%xmm1 \n\t"
02225 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
02226 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
02227 "mulps (%3,%0), %%xmm0 \n\t"
02228 "mulps 16(%3,%0), %%xmm1 \n\t"
02229 "movaps %%xmm0, (%2,%0) \n\t"
02230 "movaps %%xmm1, 16(%2,%0) \n\t"
02231 "add $32, %1 \n\t"
02232 "sub $32, %0 \n\t"
02233 "jge 1b \n\t"
02234 :"+r"(i), "+r"(src1)
02235 :"r"(dst), "r"(src0)
02236 );
02237 }
02238
/**
 * Multiply-add using 3DNow!: dst[k] = src0[k] * src1[k] + src2[k].
 *
 * Four floats are handled per iteration, walking from the end of the
 * vectors towards the start; the loop body always runs at least once.
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of floats; consumed in groups of 4
 */
02239 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
02240 const float *src2, int len){
/* byte offset of the last group of 4 floats */
02241 x86_reg i = (len-4)*4;
02242 __asm__ volatile(
02243 "1: \n\t"
02244 "movq (%2,%0), %%mm0 \n\t"
02245 "movq 8(%2,%0), %%mm1 \n\t"
02246 "pfmul (%3,%0), %%mm0 \n\t"
02247 "pfmul 8(%3,%0), %%mm1 \n\t"
02248 "pfadd (%4,%0), %%mm0 \n\t"
02249 "pfadd 8(%4,%0), %%mm1 \n\t"
02250 "movq %%mm0, (%1,%0) \n\t"
02251 "movq %%mm1, 8(%1,%0) \n\t"
02252 "sub $16, %0 \n\t"
02253 "jge 1b \n\t"
02254 :"+r"(i)
02255 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
02256 :"memory"
02257 );
/* leave MMX/3DNow! state */
02258 __asm__ volatile("femms");
02259 }
/**
 * Multiply-add using SSE: dst[k] = src0[k] * src1[k] + src2[k].
 *
 * Eight floats are handled per iteration, walking from the end of the
 * vectors towards the start; the loop body always runs at least once.
 * All vectors are accessed as aligned 16-byte vectors.
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of floats; consumed in groups of 8
 */
02260 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
02261 const float *src2, int len){
/* byte offset of the last group of 8 floats */
02262 x86_reg i = (len-8)*4;
02263 __asm__ volatile(
02264 "1: \n\t"
02265 "movaps (%2,%0), %%xmm0 \n\t"
02266 "movaps 16(%2,%0), %%xmm1 \n\t"
02267 "mulps (%3,%0), %%xmm0 \n\t"
02268 "mulps 16(%3,%0), %%xmm1 \n\t"
02269 "addps (%4,%0), %%xmm0 \n\t"
02270 "addps 16(%4,%0), %%xmm1 \n\t"
02271 "movaps %%xmm0, (%1,%0) \n\t"
02272 "movaps %%xmm1, 16(%1,%0) \n\t"
02273 "sub $32, %0 \n\t"
02274 "jge 1b \n\t"
02275 :"+r"(i)
02276 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
02277 :"memory"
02278 );
02279 }
02280
02281 #if HAVE_6REGS
02282 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
02283 const float *win, int len){
02284 x86_reg i = -len*4;
02285 x86_reg j = len*4-8;
02286 __asm__ volatile(
02287 "1: \n"
02288 "pswapd (%5,%1), %%mm1 \n"
02289 "movq (%5,%0), %%mm0 \n"
02290 "pswapd (%4,%1), %%mm5 \n"
02291 "movq (%3,%0), %%mm4 \n"
02292 "movq %%mm0, %%mm2 \n"
02293 "movq %%mm1, %%mm3 \n"
02294 "pfmul %%mm4, %%mm2 \n"
02295 "pfmul %%mm5, %%mm3 \n"
02296 "pfmul %%mm4, %%mm1 \n"
02297 "pfmul %%mm5, %%mm0 \n"
02298 "pfadd %%mm3, %%mm2 \n"
02299 "pfsub %%mm0, %%mm1 \n"
02300 "pswapd %%mm2, %%mm2 \n"
02301 "movq %%mm1, (%2,%0) \n"
02302 "movq %%mm2, (%2,%1) \n"
02303 "sub $8, %1 \n"
02304 "add $8, %0 \n"
02305 "jl 1b \n"
02306 "femms \n"
02307 :"+r"(i), "+r"(j)
02308 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
02309 );
02310 }
02311
02312 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
02313 const float *win, int len){
02314 x86_reg i = -len*4;
02315 x86_reg j = len*4-16;
02316 __asm__ volatile(
02317 "1: \n"
02318 "movaps (%5,%1), %%xmm1 \n"
02319 "movaps (%5,%0), %%xmm0 \n"
02320 "movaps (%4,%1), %%xmm5 \n"
02321 "movaps (%3,%0), %%xmm4 \n"
02322 "shufps $0x1b, %%xmm1, %%xmm1 \n"
02323 "shufps $0x1b, %%xmm5, %%xmm5 \n"
02324 "movaps %%xmm0, %%xmm2 \n"
02325 "movaps %%xmm1, %%xmm3 \n"
02326 "mulps %%xmm4, %%xmm2 \n"
02327 "mulps %%xmm5, %%xmm3 \n"
02328 "mulps %%xmm4, %%xmm1 \n"
02329 "mulps %%xmm5, %%xmm0 \n"
02330 "addps %%xmm3, %%xmm2 \n"
02331 "subps %%xmm0, %%xmm1 \n"
02332 "shufps $0x1b, %%xmm2, %%xmm2 \n"
02333 "movaps %%xmm1, (%2,%0) \n"
02334 "movaps %%xmm2, (%2,%1) \n"
02335 "sub $16, %1 \n"
02336 "add $16, %0 \n"
02337 "jl 1b \n"
02338 :"+r"(i), "+r"(j)
02339 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
02340 );
02341 }
02342 #endif
02343
/**
 * Clamp every element of src to [min, max] using SSE and store it to dst.
 *
 * Each element goes through maxps with min followed by minps with max.
 * Sixteen floats are handled per iteration, walking from the end of the
 * vectors towards the start; the loop body always runs at least once.
 * src and dst are accessed with movaps, i.e. as aligned 16-byte vectors.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of floats; consumed in groups of 16
 */
02344 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
02345 int len)
02346 {
/* byte offset of the last group of 16 floats */
02347 x86_reg i = (len-16)*4;
02348 __asm__ volatile(
/* xmm4 = min broadcast, xmm5 = max broadcast */
02349 "movss %3, %%xmm4 \n"
02350 "movss %4, %%xmm5 \n"
02351 "shufps $0, %%xmm4, %%xmm4 \n"
02352 "shufps $0, %%xmm5, %%xmm5 \n"
02353 "1: \n\t"
02354 "movaps (%2,%0), %%xmm0 \n\t"
02355 "movaps 16(%2,%0), %%xmm1 \n\t"
02356 "movaps 32(%2,%0), %%xmm2 \n\t"
02357 "movaps 48(%2,%0), %%xmm3 \n\t"
02358 "maxps %%xmm4, %%xmm0 \n\t"
02359 "maxps %%xmm4, %%xmm1 \n\t"
02360 "maxps %%xmm4, %%xmm2 \n\t"
02361 "maxps %%xmm4, %%xmm3 \n\t"
02362 "minps %%xmm5, %%xmm0 \n\t"
02363 "minps %%xmm5, %%xmm1 \n\t"
02364 "minps %%xmm5, %%xmm2 \n\t"
02365 "minps %%xmm5, %%xmm3 \n\t"
02366 "movaps %%xmm0, (%1,%0) \n\t"
02367 "movaps %%xmm1, 16(%1,%0) \n\t"
02368 "movaps %%xmm2, 32(%1,%0) \n\t"
02369 "movaps %%xmm3, 48(%1,%0) \n\t"
02370 "sub $64, %0 \n\t"
02371 "jge 1b \n\t"
02372 :"+&r"(i)
02373 :"r"(dst), "r"(src), "m"(min), "m"(max)
02374 :"memory"
02375 );
02376 }
02377
/*
 * Prototypes of routines whose bodies are not in this part of the file.
 * The VP3 ones are installed by dsputil_init_mmx() only when HAVE_YASM is
 * set; presumably all of them are implemented in external assembly --
 * TODO confirm.
 */
/* VP3 IDCT, MMX */
02378 void ff_vp3_idct_mmx(int16_t *input_data);
02379 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
02380 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
02381
02382 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
02383
/* VP3 loop filters, MMX2 */
02384 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
02385 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
02386
/* VP3 IDCT, SSE2 */
02387 void ff_vp3_idct_sse2(int16_t *input_data);
02388 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
02389 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
02390
/* int16 scalar products and HuffYUV prediction helpers */
02391 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
02392 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
02393 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02394 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02395 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02396 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
02397 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
02398 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
02399
/* float scalar product, SSE */
02400 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
02401
02402 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
02403 {
02404 int mm_flags = av_get_cpu_flags();
02405
02406 if (avctx->dsp_mask) {
02407 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
02408 mm_flags |= (avctx->dsp_mask & 0xffff);
02409 else
02410 mm_flags &= ~(avctx->dsp_mask & 0xffff);
02411 }
02412
02413 #if 0
02414 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
02415 if (mm_flags & AV_CPU_FLAG_MMX)
02416 av_log(avctx, AV_LOG_INFO, " mmx");
02417 if (mm_flags & AV_CPU_FLAG_MMX2)
02418 av_log(avctx, AV_LOG_INFO, " mmx2");
02419 if (mm_flags & AV_CPU_FLAG_3DNOW)
02420 av_log(avctx, AV_LOG_INFO, " 3dnow");
02421 if (mm_flags & AV_CPU_FLAG_SSE)
02422 av_log(avctx, AV_LOG_INFO, " sse");
02423 if (mm_flags & AV_CPU_FLAG_SSE2)
02424 av_log(avctx, AV_LOG_INFO, " sse2");
02425 av_log(avctx, AV_LOG_INFO, "\n");
02426 #endif
02427
02428 if (mm_flags & AV_CPU_FLAG_MMX) {
02429 const int idct_algo= avctx->idct_algo;
02430
02431 if(avctx->lowres==0){
02432 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
02433 c->idct_put= ff_simple_idct_put_mmx;
02434 c->idct_add= ff_simple_idct_add_mmx;
02435 c->idct = ff_simple_idct_mmx;
02436 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
02437 #if CONFIG_GPL
02438 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
02439 if(mm_flags & AV_CPU_FLAG_MMX2){
02440 c->idct_put= ff_libmpeg2mmx2_idct_put;
02441 c->idct_add= ff_libmpeg2mmx2_idct_add;
02442 c->idct = ff_mmxext_idct;
02443 }else{
02444 c->idct_put= ff_libmpeg2mmx_idct_put;
02445 c->idct_add= ff_libmpeg2mmx_idct_add;
02446 c->idct = ff_mmx_idct;
02447 }
02448 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
02449 #endif
02450 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
02451 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
02452 if(mm_flags & AV_CPU_FLAG_SSE2){
02453 c->idct_put= ff_vp3_idct_put_sse2;
02454 c->idct_add= ff_vp3_idct_add_sse2;
02455 c->idct = ff_vp3_idct_sse2;
02456 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02457 }else{
02458 c->idct_put= ff_vp3_idct_put_mmx;
02459 c->idct_add= ff_vp3_idct_add_mmx;
02460 c->idct = ff_vp3_idct_mmx;
02461 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
02462 }
02463 }else if(idct_algo==FF_IDCT_CAVS){
02464 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02465 }else if(idct_algo==FF_IDCT_XVIDMMX){
02466 if(mm_flags & AV_CPU_FLAG_SSE2){
02467 c->idct_put= ff_idct_xvid_sse2_put;
02468 c->idct_add= ff_idct_xvid_sse2_add;
02469 c->idct = ff_idct_xvid_sse2;
02470 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
02471 }else if(mm_flags & AV_CPU_FLAG_MMX2){
02472 c->idct_put= ff_idct_xvid_mmx2_put;
02473 c->idct_add= ff_idct_xvid_mmx2_add;
02474 c->idct = ff_idct_xvid_mmx2;
02475 }else{
02476 c->idct_put= ff_idct_xvid_mmx_put;
02477 c->idct_add= ff_idct_xvid_mmx_add;
02478 c->idct = ff_idct_xvid_mmx;
02479 }
02480 }
02481 }
02482
02483 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
02484 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
02485 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
02486 c->clear_block = clear_block_mmx;
02487 c->clear_blocks = clear_blocks_mmx;
02488 if ((mm_flags & AV_CPU_FLAG_SSE) &&
02489 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
02490
02491 c->clear_block = clear_block_sse;
02492 c->clear_blocks = clear_blocks_sse;
02493 }
02494
02495 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02496 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
02497 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
02498 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
02499 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
02500
02501 SET_HPEL_FUNCS(put, 0, 16, mmx);
02502 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
02503 SET_HPEL_FUNCS(avg, 0, 16, mmx);
02504 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
02505 SET_HPEL_FUNCS(put, 1, 8, mmx);
02506 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
02507 SET_HPEL_FUNCS(avg, 1, 8, mmx);
02508 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
02509
02510 #if ARCH_X86_32 || !HAVE_YASM
02511 c->gmc= gmc_mmx;
02512 #endif
02513 #if ARCH_X86_32 && HAVE_YASM
02514 c->emulated_edge_mc = emulated_edge_mc_mmx;
02515 #endif
02516
02517 c->add_bytes= add_bytes_mmx;
02518 c->add_bytes_l2= add_bytes_l2_mmx;
02519
02520 c->draw_edges = draw_edges_mmx;
02521
02522 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
02523 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
02524 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
02525 }
02526
02527 #if HAVE_YASM
02528 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
02529 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
02530
02531 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
02532 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
02533 #endif
02534
02535 if (mm_flags & AV_CPU_FLAG_MMX2) {
02536 c->prefetch = prefetch_mmx2;
02537
02538 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
02539 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
02540
02541 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
02542 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
02543 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
02544
02545 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
02546 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
02547
02548 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
02549 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
02550 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
02551
02552 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02553 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
02554 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
02555 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
02556 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
02557 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
02558 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
02559
02560 if (CONFIG_VP3_DECODER && HAVE_YASM) {
02561 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
02562 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
02563 }
02564 }
02565 if (CONFIG_VP3_DECODER && HAVE_YASM) {
02566 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
02567 }
02568
02569 if (CONFIG_VP3_DECODER
02570 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
02571 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
02572 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
02573 }
02574
02575 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02576 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
02577 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
02578 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
02579 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
02580 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
02581 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
02582 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
02583 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
02584 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
02585 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
02586 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
02587 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
02588 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
02589 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
02590 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
02591 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
02592
02593 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
02594 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
02595 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
02596 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
02597 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
02598 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
02599
02600 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
02601 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
02602 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
02603 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
02604 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
02605 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
02606
02607 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
02608 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
02609 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
02610 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
02611
02612 #if HAVE_YASM
02613 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
02614 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
02615
02616 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
02617 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
02618 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
02619 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
02620
02621 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
02622 #endif
02623 #if HAVE_7REGS && HAVE_TEN_OPERANDS
02624 if( mm_flags&AV_CPU_FLAG_3DNOW )
02625 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
02626 #endif
02627
02628 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
02629 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
02630 c->prefetch = prefetch_3dnow;
02631
02632 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02633 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02634
02635 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02636 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02637 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02638
02639 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02640 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02641
02642 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02643 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02644 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02645
02646 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02647 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02648 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02649 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02650 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02651 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02652 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02653 }
02654
02655 if (CONFIG_VP3_DECODER
02656 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
02657 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
02658 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
02659 }
02660
02661 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
02662 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
02663 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
02664 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
02665 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
02666 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
02667
02668 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
02669 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
02670 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
02671 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
02672 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
02673 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
02674
02675 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
02676 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
02677 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
02678 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
02679
02680 #if HAVE_YASM
02681 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
02682 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
02683
02684 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
02685 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
02686 #endif
02687 }
02688
02689
/*
 * Install the h264 qpel function pointers for sub-position (x, y) of the
 * given CPU flavour into the tables of the context pointed to by `c`.
 *
 *   x, y - literal digits; they are token-pasted into the function names
 *          (..._mc<x><y>_<CPU>) and also select table slot x + y*4.
 *   CPU  - implementation suffix pasted onto the function names
 *          (the call sites below use sse2 and ssse3).
 *
 * Four entries are written per invocation: the qpel16 and qpel8 variants
 * (table rows 0 and 1) for both the put_ and avg_ tables.
 *
 * A variable named `c` must be in scope at the point of use.  The body is
 * wrapped in do { } while (0) so the macro expands to exactly one statement
 * and stays correct under an unbraced if/else; callers supply the trailing
 * semicolon.
 */
#define H264_QPEL_FUNCS(x, y, CPU) \
    do { \
        c->put_h264_qpel_pixels_tab[0][(x) + (y) * 4] = put_h264_qpel16_mc##x##y##_##CPU; \
        c->put_h264_qpel_pixels_tab[1][(x) + (y) * 4] = put_h264_qpel8_mc##x##y##_##CPU;  \
        c->avg_h264_qpel_pixels_tab[0][(x) + (y) * 4] = avg_h264_qpel16_mc##x##y##_##CPU; \
        c->avg_h264_qpel_pixels_tab[1][(x) + (y) * 4] = avg_h264_qpel8_mc##x##y##_##CPU;  \
    } while (0)
02695 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
02696
02697 c->put_pixels_tab[0][0] = put_pixels16_sse2;
02698 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
02699 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
02700 H264_QPEL_FUNCS(0, 0, sse2);
02701 }
02702 if(mm_flags & AV_CPU_FLAG_SSE2){
02703 H264_QPEL_FUNCS(0, 1, sse2);
02704 H264_QPEL_FUNCS(0, 2, sse2);
02705 H264_QPEL_FUNCS(0, 3, sse2);
02706 H264_QPEL_FUNCS(1, 1, sse2);
02707 H264_QPEL_FUNCS(1, 2, sse2);
02708 H264_QPEL_FUNCS(1, 3, sse2);
02709 H264_QPEL_FUNCS(2, 1, sse2);
02710 H264_QPEL_FUNCS(2, 2, sse2);
02711 H264_QPEL_FUNCS(2, 3, sse2);
02712 H264_QPEL_FUNCS(3, 1, sse2);
02713 H264_QPEL_FUNCS(3, 2, sse2);
02714 H264_QPEL_FUNCS(3, 3, sse2);
02715 }
02716 #if HAVE_SSSE3
02717 if(mm_flags & AV_CPU_FLAG_SSSE3){
02718 H264_QPEL_FUNCS(1, 0, ssse3);
02719 H264_QPEL_FUNCS(1, 1, ssse3);
02720 H264_QPEL_FUNCS(1, 2, ssse3);
02721 H264_QPEL_FUNCS(1, 3, ssse3);
02722 H264_QPEL_FUNCS(2, 0, ssse3);
02723 H264_QPEL_FUNCS(2, 1, ssse3);
02724 H264_QPEL_FUNCS(2, 2, ssse3);
02725 H264_QPEL_FUNCS(2, 3, ssse3);
02726 H264_QPEL_FUNCS(3, 0, ssse3);
02727 H264_QPEL_FUNCS(3, 1, ssse3);
02728 H264_QPEL_FUNCS(3, 2, ssse3);
02729 H264_QPEL_FUNCS(3, 3, ssse3);
02730 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
02731 #if HAVE_YASM
02732 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
02733 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
02734 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
02735 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
02736 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
02737 if (mm_flags & AV_CPU_FLAG_SSE4)
02738 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
02739 #endif
02740 }
02741 #endif
02742
02743 if(mm_flags & AV_CPU_FLAG_3DNOW){
02744 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
02745 c->vector_fmul = vector_fmul_3dnow;
02746 }
02747 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
02748 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
02749 #if HAVE_6REGS
02750 c->vector_fmul_window = vector_fmul_window_3dnow2;
02751 #endif
02752 }
02753 if(mm_flags & AV_CPU_FLAG_MMX2){
02754 #if HAVE_YASM
02755 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
02756 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
02757 #endif
02758 }
02759 if(mm_flags & AV_CPU_FLAG_SSE){
02760 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
02761 c->ac3_downmix = ac3_downmix_sse;
02762 c->vector_fmul = vector_fmul_sse;
02763 c->vector_fmul_reverse = vector_fmul_reverse_sse;
02764 c->vector_fmul_add = vector_fmul_add_sse;
02765 #if HAVE_6REGS
02766 c->vector_fmul_window = vector_fmul_window_sse;
02767 #endif
02768 c->vector_clipf = vector_clipf_sse;
02769 #if HAVE_YASM
02770 c->scalarproduct_float = ff_scalarproduct_float_sse;
02771 #endif
02772 }
02773 if(mm_flags & AV_CPU_FLAG_3DNOW)
02774 c->vector_fmul_add = vector_fmul_add_3dnow;
02775 if(mm_flags & AV_CPU_FLAG_SSE2){
02776 #if HAVE_YASM
02777 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
02778 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
02779
02780 c->emulated_edge_mc = emulated_edge_mc_sse;
02781 c->gmc= gmc_sse;
02782 #endif
02783 }
02784 if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM)
02785 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
02786 }
02787
02788 if (CONFIG_ENCODERS)
02789 dsputilenc_init_mmx(c, avctx);
02790
02791 #if 0
02792
02793 get_pixels = just_return;
02794 put_pixels_clamped = just_return;
02795 add_pixels_clamped = just_return;
02796
02797 pix_abs16x16 = just_return;
02798 pix_abs16x16_x2 = just_return;
02799 pix_abs16x16_y2 = just_return;
02800 pix_abs16x16_xy2 = just_return;
02801
02802 put_pixels_tab[0] = just_return;
02803 put_pixels_tab[1] = just_return;
02804 put_pixels_tab[2] = just_return;
02805 put_pixels_tab[3] = just_return;
02806
02807 put_no_rnd_pixels_tab[0] = just_return;
02808 put_no_rnd_pixels_tab[1] = just_return;
02809 put_no_rnd_pixels_tab[2] = just_return;
02810 put_no_rnd_pixels_tab[3] = just_return;
02811
02812 avg_pixels_tab[0] = just_return;
02813 avg_pixels_tab[1] = just_return;
02814 avg_pixels_tab[2] = just_return;
02815 avg_pixels_tab[3] = just_return;
02816
02817 avg_no_rnd_pixels_tab[0] = just_return;
02818 avg_no_rnd_pixels_tab[1] = just_return;
02819 avg_no_rnd_pixels_tab[2] = just_return;
02820 avg_no_rnd_pixels_tab[3] = just_return;
02821
02822
02823
02824 #endif
02825 }