• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libavcodec/x86/simple_idct_mmx.c

Go to the documentation of this file.
00001 /*
00002  * Simple IDCT MMX
00003  *
00004  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * This file is part of Libav.
00007  *
00008  * Libav is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * Libav is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with Libav; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 #include "libavcodec/dsputil.h"
00023 #include "libavcodec/simple_idct.h"
00024 #include "dsputil_mmx.h"
00025 
00026 /*
00027 23170.475006
00028 22725.260826
00029 21406.727617
00030 19265.545870
00031 16384.000000
00032 12872.826198
00033 8866.956905
00034 4520.335430
00035 */
00036 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 23170.475006)
00037 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 22725.260826)
00038 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 21406.727617)
00039 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 19265.545870)
00040 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5, truncated (exact: 16384.000000); note the "- 0.5": one below 1<<14
00041 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 12872.826198)
00042 #define C6 8867  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 8866.956905)
00043 #define C7 4520  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (exact: 4520.335430)
00044 
00045 #define ROW_SHIFT 11 // row-pass shift; coeffs[] stores the matching rounder 1<<(ROW_SHIFT-1)
00046 #define COL_SHIFT 20 // 6 -- column-pass shift; NOTE(review): the meaning of the trailing "6" is not evident from this file section
00047 
00048 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // word mask: keeps 16-bit words 1 and 3, clears words 0 and 2; DC_COND_IDCT ANDs it with src0 before its zero test
00049 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 0x40000 in the low dword, 0 in the high dword; added in the DC-only path of DC_COND_IDCT before the >>13
00050 
00051 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { // 8-byte rows of four int16_t; the asm macros read rows by byte offset as N(%2) (presumably %2 is this table -- operand list not visible here)
00052         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: as two 32-bit lanes (little-endian) each lane is 1<<(ROW_SHIFT-1)
00053 //        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
00054 //        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
00055         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: same rounder, but the low 32-bit lane has an extra 1 in its upper word
00056         // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
00057 //        0, 0, 0, 0,
00058 //        0, 0, 0, 0,
00059 
00060  C4,  C4,  C4,  C4, // offset 16
00061  C4, -C4,  C4, -C4, // offset 24
00062 
00063  C2,  C6,  C2,  C6, // offset 32
00064  C6, -C2,  C6, -C2, // offset 40
00065 
00066  C1,  C3,  C1,  C3, // offset 48
00067  C5,  C7,  C5,  C7, // offset 56
00068 
00069  C3, -C7,  C3, -C7, // offset 64
00070 -C1, -C5, -C1, -C5, // offset 72
00071 
00072  C5, -C1,  C5, -C1, // offset 80
00073  C7,  C3,  C7,  C3, // offset 88
00074 
00075  C7, -C5,  C7, -C5, // offset 96
00076  C3, -C1,  C3, -C1  // offset 104
00077 };
00078 
00079 static inline void idct(int16_t *block)
00080 {
00081         DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
00082         int16_t * const temp= (int16_t*)align_tmp;
00083 
00084         __asm__ volatile(
00085 #if 0 //Alternative, simpler variant
00086 
00087 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00088         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00089         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00090         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00091         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00092         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00093         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00094         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00095         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00096         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00097         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00098         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00099         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00100         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00101         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00102         #rounder ", %%mm4               \n\t"\
00103         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00104         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00105         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00106         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00107         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00108         #rounder ", %%mm0               \n\t"\
00109         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00110         "paddd %%mm0, %%mm0             \n\t" \
00111         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00112         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00113         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00114         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00115         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00116         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00117         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00118         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00119         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00120         "psrad $" #shift ", %%mm7       \n\t"\
00121         "psrad $" #shift ", %%mm4       \n\t"\
00122         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00123         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00124         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00125         "psrad $" #shift ", %%mm1       \n\t"\
00126         "psrad $" #shift ", %%mm2       \n\t"\
00127         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00128         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00129         "movq %%mm7, " #dst "           \n\t"\
00130         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00131         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00132         "movq %%mm2, 24+" #dst "        \n\t"\
00133         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00134         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00135         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00136         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00137         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00138         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00139         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00140         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00141         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
00142         "psrad $" #shift ", %%mm2       \n\t"\
00143         "psrad $" #shift ", %%mm0       \n\t"\
00144         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00145         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00146         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00147         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00148         "psrad $" #shift ", %%mm6       \n\t"\
00149         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00150         "movq %%mm2, 8+" #dst "         \n\t"\
00151         "psrad $" #shift ", %%mm4       \n\t"\
00152         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00153         "movq %%mm4, 16+" #dst "        \n\t"\
00154 
00155 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
00156         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00157         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00158         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00159         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00160         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00161         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00162         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00163         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00164         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00165         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00166         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00167         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00168         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00169         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00170         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00171         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00172         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00173         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00174         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00175         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00176         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00177         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00178         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00179         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
00180         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
00181         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00182         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00183         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00184         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00185         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
00186         "psrad $" #shift ", %%mm7       \n\t"\
00187         "psrad $" #shift ", %%mm4       \n\t"\
00188         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00189         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00190         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00191         "psrad $" #shift ", %%mm0       \n\t"\
00192         "psrad $" #shift ", %%mm2       \n\t"\
00193         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00194         "movd %%mm7, " #dst "           \n\t"\
00195         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00196         "movd %%mm0, 16+" #dst "        \n\t"\
00197         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00198         "movd %%mm2, 96+" #dst "        \n\t"\
00199         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00200         "movd %%mm4, 112+" #dst "       \n\t"\
00201         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
00202         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00203         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00204         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00205         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00206         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00207         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00208         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00209         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00210         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00211         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
00212         "psrad $" #shift ", %%mm2       \n\t"\
00213         "psrad $" #shift ", %%mm5       \n\t"\
00214         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00215         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
00216         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00217         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00218         "psrad $" #shift ", %%mm6       \n\t"\
00219         "psrad $" #shift ", %%mm4       \n\t"\
00220         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00221         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00222         "movd %%mm2, 32+" #dst "        \n\t"\
00223         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00224         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00225         "movd %%mm6, 48+" #dst "        \n\t"\
00226         "movd %%mm4, 64+" #dst "        \n\t"\
00227         "movd %%mm5, 80+" #dst "        \n\t"\
00228 
00229 
00230 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00231         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00232         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00233         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00234         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00235         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
00236         "pand %%mm0, %%mm4              \n\t"\
00237         "por %%mm1, %%mm4               \n\t"\
00238         "por %%mm2, %%mm4               \n\t"\
00239         "por %%mm3, %%mm4               \n\t"\
00240         "packssdw %%mm4,%%mm4           \n\t"\
00241         "movd %%mm4, %%eax              \n\t"\
00242         "orl %%eax, %%eax               \n\t"\
00243         "jz 1f                          \n\t"\
00244         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00245         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00246         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00247         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00248         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00249         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00250         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00251         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00252         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00253         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00254         #rounder ", %%mm4               \n\t"\
00255         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00256         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00257         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00258         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00259         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00260         #rounder ", %%mm0               \n\t"\
00261         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00262         "paddd %%mm0, %%mm0             \n\t" \
00263         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00264         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00265         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00266         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00267         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00268         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00269         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00270         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00271         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00272         "psrad $" #shift ", %%mm7       \n\t"\
00273         "psrad $" #shift ", %%mm4       \n\t"\
00274         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00275         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00276         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00277         "psrad $" #shift ", %%mm1       \n\t"\
00278         "psrad $" #shift ", %%mm2       \n\t"\
00279         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00280         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00281         "movq %%mm7, " #dst "           \n\t"\
00282         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00283         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00284         "movq %%mm2, 24+" #dst "        \n\t"\
00285         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00286         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00287         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00288         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00289         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00290         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00291         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00292         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00293         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
00294         "psrad $" #shift ", %%mm2       \n\t"\
00295         "psrad $" #shift ", %%mm0       \n\t"\
00296         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00297         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00298         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00299         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00300         "psrad $" #shift ", %%mm6       \n\t"\
00301         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00302         "movq %%mm2, 8+" #dst "         \n\t"\
00303         "psrad $" #shift ", %%mm4       \n\t"\
00304         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00305         "movq %%mm4, 16+" #dst "        \n\t"\
00306         "jmp 2f                         \n\t"\
00307         "1:                             \n\t"\
00308         "pslld $16, %%mm0               \n\t"\
00309         "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
00310         "psrad $13, %%mm0               \n\t"\
00311         "packssdw %%mm0, %%mm0          \n\t"\
00312         "movq %%mm0, " #dst "           \n\t"\
00313         "movq %%mm0, 8+" #dst "         \n\t"\
00314         "movq %%mm0, 16+" #dst "        \n\t"\
00315         "movq %%mm0, 24+" #dst "        \n\t"\
00316         "2:                             \n\t"
00317 
00318 
00319 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
00320 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
00321 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
00322 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
00323 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
00324 
00325 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
00326 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
00327 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
00328 
00329 
00330 //IDCT(      src0,   src4,   src1,    src5,    dst, shift)
00331 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00332 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00333 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00334 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00335 
00336 #else
00337 
00338 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) /* 1-D pass over four input quadwords with a shortcut when only the R0/r0 words are non-zero; writes dst, 8+dst, 16+dst, 24+dst */ \
00339         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00340         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00341         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00342         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00343         "movq "MANGLE(wm1010)", %%mm4   \n\t" /* mask 0xFFFF0000FFFF0000 */\
00344         "pand %%mm0, %%mm4              \n\t" /* keep R4/r4, drop R0/r0 */\
00345         "por %%mm1, %%mm4               \n\t" /* OR in every other input word ... */\
00346         "por %%mm2, %%mm4               \n\t"\
00347         "por %%mm3, %%mm4               \n\t"\
00348         "packssdw %%mm4,%%mm4           \n\t" /* ... squeeze both dwords into 32 bits (non-zero stays non-zero) */\
00349         "movd %%mm4, %%eax              \n\t" /* uses eax as scratch */\
00350         "orl %%eax, %%eax               \n\t" /* sets ZF when everything except R0/r0 is zero */\
00351         "jz 1f                          \n\t" /* take the DC-only shortcut below */\
00352         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00353         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00354         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00355         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00356         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00357         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00358         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00359         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00360         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00361         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00362         #rounder ", %%mm4               \n\t" /* caller-supplied instruction text applied to mm4 */\
00363         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00364         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00365         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00366         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00367         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00368         #rounder ", %%mm0               \n\t" /* same caller-supplied instruction, applied to mm0 */\
00369         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00370         "paddd %%mm0, %%mm0             \n\t" \
00371         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00372         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00373         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00374         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00375         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00376         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00377         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00378         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00379         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00380         "psrad $" #shift ", %%mm7       \n\t"\
00381         "psrad $" #shift ", %%mm4       \n\t"\
00382         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00383         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00384         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00385         "psrad $" #shift ", %%mm1       \n\t"\
00386         "psrad $" #shift ", %%mm2       \n\t"\
00387         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00388         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00389         "movq %%mm7, " #dst "           \n\t"\
00390         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00391         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00392         "movq %%mm2, 24+" #dst "        \n\t"\
00393         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00394         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00395         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00396         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00397         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00398         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00399         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00400         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00401         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
00402         "psrad $" #shift ", %%mm2       \n\t"\
00403         "psrad $" #shift ", %%mm0       \n\t"\
00404         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00405         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00406         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00407         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00408         "psrad $" #shift ", %%mm6       \n\t"\
00409         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00410         "movq %%mm2, 8+" #dst "         \n\t"\
00411         "psrad $" #shift ", %%mm4       \n\t"\
00412         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00413         "movq %%mm4, 16+" #dst "        \n\t"\
00414         "jmp 2f                         \n\t" /* full path done: skip the shortcut */\
00415         "1:                             \n\t" /* DC-only shortcut; numeric labels let the macro be expanded repeatedly */\
00416         "pslld $16, %%mm0               \n\t" /* move R0/r0 into the upper word of each dword */\
00417         "paddd "MANGLE(d40000)", %%mm0  \n\t" /* adds 0x40000 to the low dword only */\
00418         "psrad $13, %%mm0               \n\t" /* net effect with the <<16 above: each R0/r0 scaled by 8 */\
00419         "packssdw %%mm0, %%mm0          \n\t" /* replicate the two results into all four words */\
00420         "movq %%mm0, " #dst "           \n\t" /* same quadword goes to all four outputs */\
00421         "movq %%mm0, 8+" #dst "         \n\t"\
00422         "movq %%mm0, 16+" #dst "        \n\t"\
00423         "movq %%mm0, 24+" #dst "        \n\t"\
00424         "2:                             \n\t"
00425 
00426 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) /* same transform as DC_COND_IDCT's full path, but jumps to label bt (storing nothing) when all four input quadwords are zero */ \
00427         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00428         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00429         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00430         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00431         "movq %%mm0, %%mm4              \n\t" /* unmasked: R0/r0 take part in the zero test too */\
00432         "por %%mm1, %%mm4               \n\t"\
00433         "por %%mm2, %%mm4               \n\t"\
00434         "por %%mm3, %%mm4               \n\t"\
00435         "packssdw %%mm4,%%mm4           \n\t" /* squeeze both dwords into 32 bits (non-zero stays non-zero) */\
00436         "movd %%mm4, %%eax              \n\t" /* uses eax as scratch */\
00437         "orl %%eax, %%eax               \n\t" /* sets ZF when all 16 input words are zero */\
00438         "jz " #bt "                     \n\t" /* caller-supplied branch target for the all-zero case */\
00439         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00440         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00441         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00442         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00443         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00444         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00445         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00446         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00447         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00448         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00449         #rounder ", %%mm4               \n\t" /* caller-supplied instruction text applied to mm4 */\
00450         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00451         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00452         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00453         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00454         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00455         #rounder ", %%mm0               \n\t" /* same caller-supplied instruction, applied to mm0 */\
00456         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00457         "paddd %%mm0, %%mm0             \n\t" \
00458         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00459         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00460         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00461         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00462         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00463         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00464         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00465         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00466         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00467         "psrad $" #shift ", %%mm7       \n\t"\
00468         "psrad $" #shift ", %%mm4       \n\t"\
00469         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00470         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00471         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00472         "psrad $" #shift ", %%mm1       \n\t"\
00473         "psrad $" #shift ", %%mm2       \n\t"\
00474         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00475         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00476         "movq %%mm7, " #dst "           \n\t"\
00477         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00478         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00479         "movq %%mm2, 24+" #dst "        \n\t"\
00480         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00481         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00482         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00483         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00484         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00485         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00486         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00487         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00488         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
00489         "psrad $" #shift ", %%mm2       \n\t"\
00490         "psrad $" #shift ", %%mm0       \n\t"\
00491         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00492         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00493         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00494         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00495         "psrad $" #shift ", %%mm6       \n\t"\
00496         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00497         "movq %%mm2, 8+" #dst "         \n\t"\
00498         "psrad $" #shift ", %%mm4       \n\t"\
00499         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00500         "movq %%mm4, 16+" #dst "        \n\t"\
00501 
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to the assembly text of one unconditional first-pass step: unlike
 * the *_COND_IDCT invocations further down it takes no label argument and
 * contains no branch. It is not invoked anywhere in the visible part of this
 * file.
 *
 *   src0, src4, src1, src5  memory operands loaded with movq into mm0..mm3
 *                           (src1 is loaded a second time for the B2/B3 terms)
 *   dst                     base memory operand; results are stored with movq
 *                           at dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text (mnemonic plus source operand,
 *                           e.g. "paddd 8(%2)") pasted in front of ", %%mm4"
 *                           and ", %%mm0", i.e. applied to both C4 products
 *   shift                   immediate for the psrad applied to every 32-bit
 *                           sum/difference before packssdw
 *
 * Coefficient pairs are read from (%2) at offsets 16..104; the lane comments
 * on each load name them. All of mm0-mm7 are overwritten.
 *
 * Output layout according to the lane comments (high word first):
 *   dst    : A1+B1 a1+b1 A0+B0 a0+b0
 *   8+dst  : A3+B3 a3+b3 A2+B2 a2+b2
 *   16+dst : A2-B2 a2-b2 A3-B3 a3-b3
 *   24+dst : A0-B0 a0-b0 A1-B1 a1-b1
 */
00502 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00503         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00504         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00505         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00506         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00507         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00508         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00509         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00510         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00511         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00512         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00513         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00514         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00515         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00516         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00517         #rounder ", %%mm4               \n\t"\
00518         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00519         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00520         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00521         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00522         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00523         #rounder ", %%mm0               \n\t"\
00524         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00525         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 below leaves A2 */ \
00526         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00527         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00528         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00529         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00530         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00531         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00532         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00533         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00534         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00535         "psrad $" #shift ", %%mm7       \n\t"\
00536         "psrad $" #shift ", %%mm4       \n\t"\
00537         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00538         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00539         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00540         "psrad $" #shift ", %%mm1       \n\t"\
00541         "psrad $" #shift ", %%mm2       \n\t"\
00542         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00543         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00544         "movq %%mm7, " #dst "           \n\t"\
00545         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00546         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00547         "movq %%mm2, 24+" #dst "        \n\t"\
00548         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00549         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00550         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00551         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00552         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00553         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00554         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00555         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00556         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
00557         "psrad $" #shift ", %%mm2       \n\t"\
00558         "psrad $" #shift ", %%mm0       \n\t"\
00559         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00560         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00561         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00562         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00563         "psrad $" #shift ", %%mm6       \n\t"\
00564         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00565         "movq %%mm2, 8+" #dst "         \n\t"\
00566         "psrad $" #shift ", %%mm4       \n\t"\
00567         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00568         "movq %%mm4, 16+" #dst "        \n\t"\
00569 
/*
 * First pass: four steps, each given four consecutive 8-byte source operands
 * from (%0) and the matching offset in (%1) as destination, with shift 11
 * (the value of ROW_SHIFT). DC_COND_IDCT and Z_COND_IDCT are defined outside
 * this excerpt. The extra last argument of Z_COND_IDCT is a local label
 * (4f, 2f, 1f) -- presumably the branch target taken when that source group
 * is entirely zero; TODO confirm against the macro definition.
 * Label 1 is not defined within this excerpt.
 */
00570 //IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
00571 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
00572 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
00573 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
00574 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
00575 
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- general second-pass variant.
 *
 * This definition reads all four sources. It is the one in effect on the
 * fall-through path after the first-pass *_COND_IDCT steps, i.e. when none
 * of them branched to a numbered label.
 *
 *   src0, src4, src1, src5  memory operands loaded with movq into mm0..mm3
 *                           (src1 is loaded a second time for the B2/B3 terms)
 *   dst                     base memory operand; eight 32-bit movd stores
 *   shift                   immediate for the psrad applied to every 32-bit
 *                           sum/difference before packssdw
 *
 * There is no rounder argument in this pass. Coefficient pairs are read from
 * (%2) at offsets 16..104. All of mm0-mm7 are overwritten.
 *
 * Stores, according to the lane comments:
 *   dst     A0+B0      16+dst  A1+B1      32+dst  A2+B2      48+dst  A3+B3
 *   64+dst  A3-B3      80+dst  A2-B2      96+dst  A1-B1      112+dst A0-B0
 */
00576 #undef IDCT
00577 #define IDCT(src0, src4, src1, src5, dst, shift) \
00578         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00579         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00580         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00581         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00582         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00583         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00584         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00585         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00586         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00587         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00588         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00589         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00590         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00591         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00592         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00593         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00594         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00595         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00596         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00597         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00598         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00599         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00600         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00601         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
00602         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
00603         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00604         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00605         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00606         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00607         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
00608         "psrad $" #shift ", %%mm7       \n\t"\
00609         "psrad $" #shift ", %%mm4       \n\t"\
00610         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00611         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00612         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00613         "psrad $" #shift ", %%mm0       \n\t"\
00614         "psrad $" #shift ", %%mm2       \n\t"\
00615         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00616         "movd %%mm7, " #dst "           \n\t"\
00617         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00618         "movd %%mm0, 16+" #dst "        \n\t"\
00619         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00620         "movd %%mm2, 96+" #dst "        \n\t"\
00621         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00622         "movd %%mm4, 112+" #dst "       \n\t"\
00623         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
00624         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00625         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00626         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00627         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00628         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00629         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00630         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00631         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00632         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00633         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00634         "psrad $" #shift ", %%mm2       \n\t"\
00635         "psrad $" #shift ", %%mm5       \n\t"\
00636         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00637         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
00638         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00639         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00640         "psrad $" #shift ", %%mm6       \n\t"\
00641         "psrad $" #shift ", %%mm4       \n\t"\
00642         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00643         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00644         "movd %%mm2, 32+" #dst "        \n\t"\
00645         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00646         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00647         "movd %%mm6, 48+" #dst "        \n\t"\
00648         "movd %%mm4, 64+" #dst "        \n\t"\
00649         "movd %%mm5, 80+" #dst "        \n\t"
00650 
00651 
/*
 * Second pass with the general IDCT variant: four invocations reading from
 * (%1) and storing to (%0) at 0, 4, 8 and 12, with shift 20 (the value of
 * COL_SHIFT). Afterwards control jumps to label 9, which lies beyond this
 * excerpt.
 */
00652 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00653 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00654 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00655 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00656 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00657         "jmp 9f                         \n\t"
00658 
/*
 * Label 4: target named by the first-pass Z_COND_IDCT for the 32(%0) group.
 * The remaining two groups are still processed here, with their own labels
 * 6f and 5f. The ".p2align" text starts with '#', so the assembler presumably
 * treats it as a comment (alignment disabled) -- TODO confirm.
 */
00659         "# .p2align 4                   \n\t"\
00660         "4:                             \n\t"
00661 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
00662 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
00663 
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- second-pass variant that
 * ignores src1.
 *
 * Same interface and store layout as the general variant, but src1 is never
 * loaded: the odd-part terms are built from src5 only
 * (B0 = C7R7+C5R5, B1 = -C5R7-C1R5, B2 = C3R7+C7R5, B3 = -C1R7+C3R5).
 * That equals the general formula when src1 is all zero; this definition is
 * in effect on the path entered at label 4.
 *
 *   src0, src4, src5  memory operands loaded with movq into mm0, mm1, mm3
 *   src1              accepted for signature compatibility, unused
 *   dst               base memory operand; movd stores at dst + 0,16,...,112
 *   shift             immediate for psrad
 *
 * Coefficient pairs are read from (%2). All of mm0-mm7 are overwritten.
 */
00664 #undef IDCT
00665 #define IDCT(src0, src4, src1, src5, dst, shift) \
00666         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00667         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00668         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00669         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00670         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00671         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00672         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00673         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00674         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00675         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00676         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00677         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00678         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00679         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00680         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00681         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00682         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00683         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00684         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00685         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
00686         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00687         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
00688         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00689         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00690         "psrad $" #shift ", %%mm1       \n\t"\
00691         "psrad $" #shift ", %%mm4       \n\t"\
00692         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00693         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00694         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00695         "psrad $" #shift ", %%mm0       \n\t"\
00696         "psrad $" #shift ", %%mm2       \n\t"\
00697         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
00698         "movd %%mm1, " #dst "           \n\t"\
00699         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00700         "movd %%mm0, 16+" #dst "        \n\t"\
00701         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00702         "movd %%mm2, 96+" #dst "        \n\t"\
00703         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00704         "movd %%mm4, 112+" #dst "       \n\t"\
00705         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
00706         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00707         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00708         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00709         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00710         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00711         "psrad $" #shift ", %%mm2       \n\t"\
00712         "psrad $" #shift ", %%mm5       \n\t"\
00713         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
00714         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00715         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
00716         "psrad $" #shift ", %%mm6       \n\t"\
00717         "psrad $" #shift ", %%mm1       \n\t"\
00718         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00719         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00720         "movd %%mm2, 32+" #dst "        \n\t"\
00721         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
00722         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00723         "movd %%mm6, 48+" #dst "        \n\t"\
00724         "movd %%mm1, 64+" #dst "        \n\t"\
00725         "movd %%mm5, 80+" #dst "        \n\t"
00726 
/*
 * Second pass for the label-4 path, using the IDCT variant that ignores src1
 * (the 32(%1)..56(%1) operands are passed but not read). Results go to (%0)
 * at 0, 4, 8 and 12 with shift 20, then control jumps to label 9 (beyond this
 * excerpt).
 */
00727 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00728 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00729 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00730 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00731 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00732         "jmp 9f                         \n\t"
00733 
/*
 * Label 6: target named by the Z_COND_IDCT for the 64(%0) group on the
 * label-4 path. Only the 96(%0) group is left to process; its label 7 is not
 * defined within this excerpt.
 */
00734         "# .p2align 4                   \n\t"\
00735         "6:                             \n\t"
00736 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
00737 
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- second-pass variant that
 * ignores src4 and src1.
 *
 * Only src0 and src5 are loaded. Without the src4 terms the even part
 * collapses to A0 = A3 = C4R4+C4R0 (mm4, copied to mm6) and
 * A1 = A2 = -C4R4+C4R0 (mm0, copied to mm5); the odd part is built from src5
 * only. That equals the general formula when src4 and src1 are all zero; this
 * definition is in effect on the path entered at label 6.
 *
 *   src0, src5   memory operands loaded with movq into mm0 and mm3
 *   src4, src1   accepted for signature compatibility, unused
 *   dst          base memory operand; movd stores at dst + 0,16,...,112
 *   shift        immediate for psrad
 *
 * Coefficient pairs are read from (%2). All of mm0-mm7 are overwritten.
 */
00738 #undef IDCT
00739 #define IDCT(src0, src4, src1, src5, dst, shift) \
00740         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00741         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00742         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00743         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00744         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00745         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00746         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00747         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00748         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00749         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00750         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
00751         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00752         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
00753         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00754         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00755         "psrad $" #shift ", %%mm1       \n\t"\
00756         "psrad $" #shift ", %%mm4       \n\t"\
00757         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00758         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00759         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00760         "psrad $" #shift ", %%mm0       \n\t"\
00761         "psrad $" #shift ", %%mm2       \n\t"\
00762         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
00763         "movd %%mm1, " #dst "           \n\t"\
00764         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00765         "movd %%mm0, 16+" #dst "        \n\t"\
00766         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00767         "movd %%mm2, 96+" #dst "        \n\t"\
00768         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00769         "movd %%mm4, 112+" #dst "       \n\t"\
00770         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
00771         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00772         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00773         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00774         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00775         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00776         "psrad $" #shift ", %%mm2       \n\t"\
00777         "psrad $" #shift ", %%mm5       \n\t"\
00778         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
00779         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00780         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
00781         "psrad $" #shift ", %%mm6       \n\t"\
00782         "psrad $" #shift ", %%mm1       \n\t"\
00783         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00784         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00785         "movd %%mm2, 32+" #dst "        \n\t"\
00786         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
00787         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00788         "movd %%mm6, 48+" #dst "        \n\t"\
00789         "movd %%mm1, 64+" #dst "        \n\t"\
00790         "movd %%mm5, 80+" #dst "        \n\t"
00791 
00792 
/*
 * Second pass for the label-6 path, using the IDCT variant that ignores src4
 * and src1 (those operands are passed but not read). Results go to (%0) at
 * 0, 4, 8 and 12 with shift 20, then control jumps to label 9 (beyond this
 * excerpt).
 */
00793 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00794 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00795 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00796 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00797 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00798         "jmp 9f                         \n\t"
00799 
/*
 * Label 2: target named by the first-pass Z_COND_IDCT for the 64(%0) group
 * on the main path. Only the 96(%0) group is left to process, with label 3f.
 */
00800         "# .p2align 4                   \n\t"\
00801         "2:                             \n\t"
00802 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
00803 
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- second-pass variant that
 * ignores src4.
 *
 * src0, src1 and src5 are loaded. Without the src4 terms the even part
 * collapses to A0 = A3 = C4R4+C4R0 (mm4, copied to mm6) and
 * A1 = A2 = -C4R4+C4R0 (mm0, copied to mm5); the odd part B0..B3 is computed
 * exactly as in the general variant. That equals the general formula when
 * src4 is all zero; this definition is in effect on the path entered at
 * label 2.
 *
 *   src0, src1, src5  memory operands loaded with movq into mm0, mm2, mm3
 *                     (src1 is loaded a second time for the B2/B3 terms)
 *   src4              accepted for signature compatibility, unused
 *   dst               base memory operand; movd stores at dst + 0,16,...,112
 *   shift             immediate for psrad
 *
 * Coefficient pairs are read from (%2). All of mm0-mm7 are overwritten.
 */
00804 #undef IDCT
00805 #define IDCT(src0, src4, src1, src5, dst, shift) \
00806         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00807         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00808         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00809         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00810         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00811         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00812         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00813         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00814         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00815         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00816         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00817         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00818         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00819         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00820         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
00821         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
00822         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00823         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00824         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00825         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00826         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
00827         "psrad $" #shift ", %%mm7       \n\t"\
00828         "psrad $" #shift ", %%mm4       \n\t"\
00829         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00830         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00831         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00832         "psrad $" #shift ", %%mm0       \n\t"\
00833         "psrad $" #shift ", %%mm2       \n\t"\
00834         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00835         "movd %%mm7, " #dst "           \n\t"\
00836         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00837         "movd %%mm0, 16+" #dst "        \n\t"\
00838         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00839         "movd %%mm2, 96+" #dst "        \n\t"\
00840         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00841         "movd %%mm4, 112+" #dst "       \n\t"\
00842         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
00843         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00844         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00845         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00846         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00847         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00848         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00849         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00850         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00851         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00852         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00853         "psrad $" #shift ", %%mm2       \n\t"\
00854         "psrad $" #shift ", %%mm5       \n\t"\
00855         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00856         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
00857         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00858         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00859         "psrad $" #shift ", %%mm6       \n\t"\
00860         "psrad $" #shift ", %%mm4       \n\t"\
00861         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00862         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00863         "movd %%mm2, 32+" #dst "        \n\t"\
00864         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00865         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00866         "movd %%mm6, 48+" #dst "        \n\t"\
00867         "movd %%mm4, 64+" #dst "        \n\t"\
00868         "movd %%mm5, 80+" #dst "        \n\t"
00869 
/*
 * Second pass for the label-2 path, using the IDCT variant that ignores src4
 * (the 64(%1)..88(%1) operands are passed but not read). Results go to (%0)
 * at 0, 4, 8 and 12 with shift 20, then control jumps to label 9 (beyond this
 * excerpt).
 */
00870 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00871 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00872 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00873 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00874 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00875         "jmp 9f                         \n\t"
00876 
/*
 * Label 3: target named by the Z_COND_IDCT for the 96(%0) group on the
 * label-2 path. No first-pass work remains; the second pass follows directly.
 */
00877         "# .p2align 4                   \n\t"\
00878         "3:                             \n\t"
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- second-pass variant that
 * ignores src4 and src5.
 *
 * Only src0 and src1 are loaded (src1 once; mm2 keeps it for all four odd
 * terms). The even part collapses to A0 = A3 = C4R4+C4R0 (mm4, copied to mm6)
 * and A1 = A2 = -C4R4+C4R0 (mm0, copied to mm5); the odd part is built from
 * src1 only (B0 = C3R3+C1R1, B1 = -C7R3+C3R1, B2 = -C1R3+C5R1,
 * B3 = -C5R3+C7R1). That equals the general formula when src4 and src5 are
 * all zero; this definition is in effect on the path entered at label 3.
 *
 *   src0, src1   memory operands loaded with movq into mm0 and mm2
 *   src4, src5   accepted for signature compatibility, unused
 *   dst          base memory operand; movd stores at dst + 0,16,...,112
 *   shift        immediate for psrad
 *
 * Coefficient pairs are read from (%2). All of mm0-mm7 are overwritten.
 */
00879 #undef IDCT
00880 #define IDCT(src0, src4, src1, src5, dst, shift) \
00881         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00882         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00883         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00884         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00885         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00886         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00887         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00888         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00889         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00890         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00891         "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 */\
00892         "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00893         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00894         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00895         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00896         "psrad $" #shift ", %%mm7       \n\t"\
00897         "psrad $" #shift ", %%mm4       \n\t"\
00898         "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
00899         "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00900         "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
00901         "psrad $" #shift ", %%mm0       \n\t"\
00902         "psrad $" #shift ", %%mm1       \n\t"\
00903         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00904         "movd %%mm7, " #dst "           \n\t"\
00905         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00906         "movd %%mm0, 16+" #dst "        \n\t"\
00907         "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
00908         "movd %%mm1, 96+" #dst "        \n\t"\
00909         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00910         "movd %%mm4, 112+" #dst "       \n\t"\
00911         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00912         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00913         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00914         "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
00915         "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
00916         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00917         "psrad $" #shift ", %%mm1       \n\t"\
00918         "psrad $" #shift ", %%mm5       \n\t"\
00919         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00920         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00921         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00922         "psrad $" #shift ", %%mm6       \n\t"\
00923         "psrad $" #shift ", %%mm4       \n\t"\
00924         "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
00925         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00926         "movd %%mm1, 32+" #dst "        \n\t"\
00927         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00928         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00929         "movd %%mm6, 48+" #dst "        \n\t"\
00930         "movd %%mm4, 64+" #dst "        \n\t"\
00931         "movd %%mm5, 80+" #dst "        \n\t"
00932 
00933 
/*
 * Second pass for the label-3 path, using the IDCT variant that ignores src4
 * and src5 (those operands are passed but not read). Results go to (%0) at
 * 0, 4, 8 and 12 with shift 20, then control jumps to label 9 (beyond this
 * excerpt).
 */
00934 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00935 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00936 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00937 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00938 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00939         "jmp 9f                         \n\t"
00940 
/*
 * Label 5: target named by the Z_COND_IDCT for the 96(%0) group on the
 * label-4 path. No first-pass work remains; the IDCT redefinition for this
 * path follows directly.
 */
00941         "# .p2align 4                   \n\t"\
00942         "5:                             \n\t"
00943 #undef IDCT
00944 #define IDCT(src0, src4, src1, src5, dst, shift) \
00945         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00946         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00947         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00948         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00949         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00950         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00951         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00952         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00953         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00954         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00955         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00956         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00957         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00958         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00959         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00960         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00961         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
00962         "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
00963         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
00964         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00965         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
00966         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00967         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
00968         "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00969         "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00970         "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
00971         "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
00972         "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
00973         "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
00974         "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
00975         "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
00976         "psrad $" #shift ", %%mm4       \n\t"\
00977         "psrad $" #shift ", %%mm7       \n\t"\
00978         "psrad $" #shift ", %%mm3       \n\t"\
00979         "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
00980         "movq %%mm4, " #dst "           \n\t"\
00981         "psrad $" #shift ", %%mm0       \n\t"\
00982         "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
00983         "movq %%mm0, 16+" #dst "        \n\t"\
00984         "movq %%mm0, 96+" #dst "        \n\t"\
00985         "movq %%mm4, 112+" #dst "       \n\t"\
00986         "psrad $" #shift ", %%mm5       \n\t"\
00987         "psrad $" #shift ", %%mm6       \n\t"\
00988         "psrad $" #shift ", %%mm2       \n\t"\
00989         "packssdw %%mm2, %%mm5          \n\t" /* A2     a2 */\
00990         "movq %%mm5, 32+" #dst "        \n\t"\
00991         "psrad $" #shift ", %%mm1       \n\t"\
00992         "packssdw %%mm1, %%mm6          \n\t" /* A3     a3 */\
00993         "movq %%mm6, 48+" #dst "        \n\t"\
00994         "movq %%mm6, 64+" #dst "        \n\t"\
00995         "movq %%mm5, 80+" #dst "        \n\t"
00996 
00997 
00998 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00999 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01000 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01001 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01002 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01003         "jmp 9f                         \n\t"
01004 
01005 
01006         "# .p2align 4                   \n\t"\
01007         "1:                             \n\t"
01008 #undef IDCT
01009 #define IDCT(src0, src4, src1, src5, dst, shift) \
01010         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
01011         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
01012         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
01013         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
01014         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01015         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
01016         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01017         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
01018         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
01019         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
01020         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
01021         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01022         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
01023         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
01024         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
01025         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
01026         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01027         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
01028         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
01029         "movq 64(%2), %%mm1             \n\t"\
01030         "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
01031         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
01032         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
01033         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
01034         "psrad $" #shift ", %%mm7       \n\t"\
01035         "psrad $" #shift ", %%mm4       \n\t"\
01036         "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
01037         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
01038         "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
01039         "psrad $" #shift ", %%mm0       \n\t"\
01040         "psrad $" #shift ", %%mm3       \n\t"\
01041         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
01042         "movd %%mm7, " #dst "           \n\t"\
01043         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
01044         "movd %%mm0, 16+" #dst "        \n\t"\
01045         "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
01046         "movd %%mm3, 96+" #dst "        \n\t"\
01047         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
01048         "movd %%mm4, 112+" #dst "       \n\t"\
01049         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
01050         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
01051         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
01052         "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
01053         "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
01054         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
01055         "psrad $" #shift ", %%mm3       \n\t"\
01056         "psrad $" #shift ", %%mm5       \n\t"\
01057         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
01058         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
01059         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
01060         "psrad $" #shift ", %%mm6       \n\t"\
01061         "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
01062         "movd %%mm3, 32+" #dst "        \n\t"\
01063         "psrad $" #shift ", %%mm4       \n\t"\
01064         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
01065         "movd %%mm6, 48+" #dst "        \n\t"\
01066         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
01067         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
01068         "movd %%mm4, 64+" #dst "        \n\t"\
01069         "movd %%mm5, 80+" #dst "        \n\t"
01070 
01071 
01072 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01073 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01074 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01075 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01076 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01077         "jmp 9f                         \n\t"
01078 
01079 
01080         "# .p2align 4                   \n\t"
01081         "7:                             \n\t"
01082 #undef IDCT
01083 #define IDCT(src0, src4, src1, src5, dst, shift) \
01084         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
01085         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
01086         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01087         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
01088         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01089         "psrad $" #shift ", %%mm4       \n\t"\
01090         "psrad $" #shift ", %%mm0       \n\t"\
01091         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
01092         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
01093         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01094         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
01095         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01096         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
01097         "psrad $" #shift ", %%mm1       \n\t"\
01098         "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
01099         "movq %%mm4, " #dst "           \n\t"\
01100         "psrad $" #shift ", %%mm2       \n\t"\
01101         "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
01102         "movq %%mm0, 16+" #dst "        \n\t"\
01103         "movq %%mm0, 96+" #dst "        \n\t"\
01104         "movq %%mm4, 112+" #dst "       \n\t"\
01105         "movq %%mm0, 32+" #dst "        \n\t"\
01106         "movq %%mm4, 48+" #dst "        \n\t"\
01107         "movq %%mm4, 64+" #dst "        \n\t"\
01108         "movq %%mm0, 80+" #dst "        \n\t"
01109 
01110 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01111 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01112 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01113 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01114 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01115 
01116 
01117 #endif
01118 
01119 /*
01120 Input
01121  00 40 04 44 20 60 24 64
01122  10 30 14 34 50 70 54 74
01123  01 41 03 43 21 61 23 63
01124  11 31 13 33 51 71 53 73
01125  02 42 06 46 22 62 26 66
01126  12 32 16 36 52 72 56 76
01127  05 45 07 47 25 65 27 67
01128  15 35 17 37 55 75 57 77
01129 
01130 Temp
01131  00 04 10 14 20 24 30 34
01132  40 44 50 54 60 64 70 74
01133  01 03 11 13 21 23 31 33
01134  41 43 51 53 61 63 71 73
01135  02 06 12 16 22 26 32 36
01136  42 46 52 56 62 66 72 76
01137  05 07 15 17 25 27 35 37
01138  45 47 55 57 65 67 75 77
01139 */
01140 
01141 "9: \n\t"
01142                 :: "r" (block), "r" (temp), "r" (coeffs)
01143                 : "%eax"
01144         );
01145 }
01146 
/**
 * Exported wrapper around the file-local idct() routine.
 *
 * The only work done here is forwarding @p block to idct(); the final pass
 * of idct() stores its results through the same pointer, so the transform
 * happens in place and nothing is returned.
 *
 * @param block coefficient buffer that is transformed in place
 *              (presumably one 8x8 block of 64 int16_t values -- confirm
 *              against the callers)
 */
01147 void ff_simple_idct_mmx(int16_t *block)
01148 {
01149     idct(block);
01150 }
01151 
01152 //FIXME merge add/put into the idct
01153 
/**
 * Run idct() on @p block, then pass the transformed block to
 * ff_put_pixels_clamped_mmx() together with @p dest and @p line_size.
 *
 * The two steps are separate passes over the data; the order matters because
 * the second call consumes what the first one wrote into @p block.
 *
 * NOTE(review): judging by the helper's name, it clamps the samples and
 * stores (overwrites) them at dest, using line_size as the row stride --
 * confirm against its declaration in dsputil_mmx.h.
 *
 * @param dest      destination pixel buffer handed to the put helper
 * @param line_size passed through unchanged to the put helper
 * @param block     coefficient buffer, transformed in place by idct()
 */
01154 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01155 {
01156     idct(block);
01157     ff_put_pixels_clamped_mmx(block, dest, line_size);
01158 }
/**
 * Run idct() on @p block, then pass the transformed block to
 * ff_add_pixels_clamped_mmx() together with @p dest and @p line_size.
 *
 * The two steps are separate passes over the data; the order matters because
 * the second call consumes what the first one wrote into @p block.
 *
 * NOTE(review): judging by the helper's name, it adds the samples to the
 * pixels already at dest and clamps the sums, using line_size as the row
 * stride -- confirm against its declaration in dsputil_mmx.h.
 *
 * @param dest      destination pixel buffer handed to the add helper
 * @param line_size passed through unchanged to the add helper
 * @param block     coefficient buffer, transformed in place by idct()
 */
01159 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01160 {
01161     idct(block);
01162     ff_add_pixels_clamped_mmx(block, dest, line_size);
01163 }
Generated on Sun Apr 22 2012 21:54:06 for Libav by doxygen 1.7.1