Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

X86.c

Go to the documentation of this file.
00001 /*
00002 
00003     x86 specific optimized assembler dsp routines
00004     Copyright (C) 2001-2004 Jussi Laako
00005 
00006     This program is free software; you can redistribute it and/or modify
00007     it under the terms of the GNU General Public License as published by
00008     the Free Software Foundation; either version 2 of the License, or
00009     (at your option) any later version.
00010 
00011     This program is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014     GNU General Public License for more details.
00015 
00016     You should have received a copy of the GNU General Public License
00017     along with this program; if not, write to the Free Software
00018     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019 
00020 */
00021 
00022 
00023 #ifdef DSP_X86
00024 
00025 
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <limits.h>
00029 #include <math.h>
00030 #include <float.h>
00031 
00032 #include "dsp/X86.h"
00033 
00034 
00035 #ifndef DSP_X86_64
00036 static char cpCPUid[13];
00037 #endif
00038 
00039 
00040 #ifdef __cplusplus
00041 extern "C"
00042 {
00043 #endif
00044 
00045 
00046 #ifndef DSP_X86_64
00047 const char *dsp_x86_cpuid ()
00048 {
00049     unsigned int *ipCPUid = (unsigned int *) cpCPUid;
00050     
00051     X86_ASM (
00052         "pushl %%ebx\n\t" \
00053         "xorl %%eax, %%eax\n\t" \
00054         "cpuid\n\t" \
00055         "movl %%ebx, %0\n\t" \
00056         "movl %%ecx, %2\n\t" \
00057         "movl %%edx, %1\n\t" \
00058         "popl %%ebx\n\t" \
00059         : "=m" (ipCPUid[0]),
00060           "=m" (ipCPUid[1]),
00061           "=m" (ipCPUid[2])
00062         :
00063         : "eax", "ecx", "edx", "memory");
00064     cpCPUid[12] = '\0';
00065 
00066     return cpCPUid;
00067 }
00068 
00069 
00070 unsigned int dsp_x86_features ()
00071 {
00072     unsigned int uiFeatures = 0;
00073     
00074     X86_ASM (
00075         "pushl %%ebx\n\t" \
00076         "movl $1, %%eax\n\t" \
00077         "cpuid\n\t" \
00078         "movl %%edx, %0\n\t" \
00079         "popl %%ebx\n\t" \
00080         : "=m" (uiFeatures)
00081         :
00082         : "eax", "ecx", "edx", "memory");
00083     
00084     return uiFeatures;
00085 }
00086 
00087 
00088 unsigned int dsp_x86_amd_features ()
00089 {
00090     unsigned int uiFunction = 0x80000001;
00091     unsigned int uiFeatures = 0;
00092     
00093     X86_ASM (
00094         "pushl %%ebx\n\t" \
00095         "movl %1, %%eax\n\t" \
00096         "cpuid\n\t" \
00097         "movl %%edx, %0\n\t" \
00098         "popl %%ebx\n\t" \
00099         : "=m" (uiFeatures)
00100         : "m" (uiFunction)
00101         : "eax", "ecx", "edx", "memory");
00102     
00103     return uiFeatures;
00104 }
00105 #endif
00106 
00107 
00108 extern int dsp_x86_have_e3dnow ()
00109 {
00110     #ifndef DSP_X86_64
00111     unsigned int uiExtSup = 0;
00112     unsigned int uiFeatures;
00113 
00114     X86_ASM (
00115         "pushl %%ebx\n\t" \
00116         "movl $0x80000000, %%eax\n\t" \
00117         "cpuid\n\t" \
00118         "cmpl $0x80000001, %%eax\n\t" \
00119         "jl have3dnowxit\n\t" \
00120         "movl $1, %0\n\t" \
00121         "have3dnowxit:\n\t" \
00122         "popl %%ebx\n\t"
00123         : "=m" (uiExtSup)
00124         :
00125         : "eax", "ecx", "edx", "memory");
00126     if (uiExtSup)
00127     {
00128         uiFeatures = dsp_x86_amd_features();
00129         if ((uiFeatures & (1 << 31)) && (uiFeatures & (1 << 30)))
00130             return 1;
00131     }
00132     return 0;
00133     #else
00134     return 1;
00135     #endif
00136 }
00137 
00138 
/*
    Report whether the CPU advertises both SSE and SSE2.

    On 32-bit builds this tests bits 25 and 26 of the value returned by
    dsp_x86_features(); both must be set.  When DSP_X86_64 is defined the
    function unconditionally returns 1.

    Returns: 1 if both bits are set (or on DSP_X86_64 builds), else 0.
*/
extern int dsp_x86_have_sse2 ()
{
    #ifdef DSP_X86_64
    return 1;
    #else
    const unsigned int uiRequired = (1U << 25) | (1U << 26);

    return ((dsp_x86_features() & uiRequired) == uiRequired) ? 1 : 0;
    #endif
}
00152 
00153 
/*
    Copy iDataLength floats from fpSrc to fpDest using MMX registers.

    The buffers are accessed as arrays of 64-bit words through stpm64
    (declared in dsp/X86.h; presumably a pointer to an 8-byte type holding
    two floats - confirm against the header).  The copy runs in three
    stages:
      1. blocks of 8 quadwords (16 floats): eight movq loads into
         mm0..mm7 followed by eight movntq stores;
      2. remaining whole quadwords, one movq/movntq pair at a time;
      3. a final single float via movd when iDataLength is odd.
    prefetchnta is issued on the source 32 quadwords ahead of the current
    index; those operands are used only as prefetch addresses and may lie
    past the end of the source buffer.  The function finishes with femms
    and sfence.

    fpDest       destination buffer
    fpSrc        source buffer
    iDataLength  number of floats to copy
*/
00154 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
00155 {
00156     int iStartIdx;
00157     int iDataCntr;
00158     int iDataCount;
00159     stpm64 m64pDest = (stpm64) fpDest;
00160     stpm64 m64pSrc = (stpm64) fpSrc;
00161     
00162     iStartIdx = 0;
          /* Prefetch the start of the source before entering the copy loop. */
00163     X86_ASM (
00164         "prefetchnta %0\n\t" \
00165         "prefetchnta %1\n\t" \
00166         "prefetchnta %2\n\t" \
00167         "prefetchnta %3\n\t"
00168         :
00169         : "m" (m64pSrc[0]),
00170           "m" (m64pSrc[8]),
00171           "m" (m64pSrc[16]),
00172           "m" (m64pSrc[24]));
          /* Quadword count covered by whole 16-float blocks. */
00173     iDataCount = ((iDataLength & 0xfffffff0) >> 1);
00174     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00175     {
              /* Operands: %0-%7 destination quadwords, %8-%15 source
                 quadwords, %16 prefetch address. */
00176         X86_ASM (
00177             "prefetchnta %16\n\t" \
00178             "movq %8, %%mm0\n\t" \
00179             "movq %9, %%mm1\n\t" \
00180             "movq %10, %%mm2\n\t" \
00181             "movq %11, %%mm3\n\t" \
00182             "movq %12, %%mm4\n\t" \
00183             "movq %13, %%mm5\n\t" \
00184             "movq %14, %%mm6\n\t" \
00185             "movq %15, %%mm7\n\t" \
00186             "movntq %%mm0, %0\n\t" \
00187             "movntq %%mm1, %1\n\t" \
00188             "movntq %%mm2, %2\n\t" \
00189             "movntq %%mm3, %3\n\t" \
00190             "movntq %%mm4, %4\n\t" \
00191             "movntq %%mm5, %5\n\t" \
00192             "movntq %%mm6, %6\n\t" \
00193             "movntq %%mm7, %7\n\t"
00194             : "=m" (m64pDest[iDataCntr]),
00195               "=m" (m64pDest[iDataCntr + 1]),
00196               "=m" (m64pDest[iDataCntr + 2]),
00197               "=m" (m64pDest[iDataCntr + 3]),
00198               "=m" (m64pDest[iDataCntr + 4]),
00199               "=m" (m64pDest[iDataCntr + 5]),
00200               "=m" (m64pDest[iDataCntr + 6]),
00201               "=m" (m64pDest[iDataCntr + 7])
00202             : "m" (m64pSrc[iDataCntr]),
00203               "m" (m64pSrc[iDataCntr + 1]),
00204               "m" (m64pSrc[iDataCntr + 2]),
00205               "m" (m64pSrc[iDataCntr + 3]),
00206               "m" (m64pSrc[iDataCntr + 4]),
00207               "m" (m64pSrc[iDataCntr + 5]),
00208               "m" (m64pSrc[iDataCntr + 6]),
00209               "m" (m64pSrc[iDataCntr + 7]),
00210               "m" (m64pSrc[iDataCntr + 32])
00211             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00212     }
          /* Remaining whole quadwords (pairs of floats). */
00213     iStartIdx = iDataCount;
00214     iDataCount = ((iDataLength & 0xfffffffe) >> 1);
00215     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00216     {
00217         X86_ASM (
00218             "prefetchnta %2\n\t" \
00219             "movq %1, %%mm0\n\t" \
00220             "movntq %%mm0, %0\n\t"
00221             : "=m" (m64pDest[iDataCntr])
00222             : "m" (m64pSrc[iDataCntr]),
00223               "m" (m64pSrc[iDataCntr + 32])
00224             : "mm0", "memory");
00225     }
          /* Odd length: copy the last float on its own. */
00226     if (iDataLength & 0x1)
00227     {
00228         X86_ASM (
00229             "movd %1, %%mm0\n\t" \
00230             "movd %%mm0, %0\n\t"
00231             : "=m" (fpDest[iDataLength - 1])
00232             : "m" (fpSrc[iDataLength - 1])
00233             : "mm0", "memory");
00234     }
00235     X86_ASM (
00236         "femms\n\t" \
00237         "sfence\n\t");
00238 }
00239 
00240 
/*
    Copy iDataLength doubles from dpSrc to dpDest using MMX registers.

    Each double is moved as one 64-bit word.  Blocks of 8 doubles are
    copied with eight movq loads into mm0..mm7 followed by eight movntq
    stores; the remaining elements are copied one movq/movntq pair at a
    time.  prefetchnta is issued on the source 32 elements ahead of the
    current index; those operands are used only as prefetch addresses and
    may lie past the end of the source buffer.  The function finishes
    with femms and sfence.

    dpDest       destination buffer
    dpSrc        source buffer
    iDataLength  number of doubles to copy
*/
00241 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
00242 {
00243     int iStartIdx;
00244     int iDataCntr;
00245     int iDataCount;
00246     
00247     iStartIdx = 0;
00248     X86_ASM (
00249         "prefetchnta %0\n\t" \
00250         "prefetchnta %1\n\t" \
00251         "prefetchnta %2\n\t" \
00252         "prefetchnta %3\n\t"
00253         :
00254         : "m" (dpSrc[0]),
00255           "m" (dpSrc[8]),
00256           "m" (dpSrc[16]),
00257           "m" (dpSrc[24]));
          /* Element count covered by whole 8-double blocks. */
00258     iDataCount = (iDataLength & 0xfffffff8);
00259     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00260     {
              /* Operands: %0-%7 destination, %8-%15 source, %16 prefetch
                 address. */
00261         X86_ASM (
00262             "prefetchnta %16\n\t" \
00263             "movq %8, %%mm0\n\t" \
00264             "movq %9, %%mm1\n\t" \
00265             "movq %10, %%mm2\n\t" \
00266             "movq %11, %%mm3\n\t" \
00267             "movq %12, %%mm4\n\t" \
00268             "movq %13, %%mm5\n\t" \
00269             "movq %14, %%mm6\n\t" \
00270             "movq %15, %%mm7\n\t" \
00271             "movntq %%mm0, %0\n\t" \
00272             "movntq %%mm1, %1\n\t" \
00273             "movntq %%mm2, %2\n\t" \
00274             "movntq %%mm3, %3\n\t" \
00275             "movntq %%mm4, %4\n\t" \
00276             "movntq %%mm5, %5\n\t" \
00277             "movntq %%mm6, %6\n\t" \
00278             "movntq %%mm7, %7\n\t"
00279             : "=m" (dpDest[iDataCntr]),
00280               "=m" (dpDest[iDataCntr + 1]),
00281               "=m" (dpDest[iDataCntr + 2]),
00282               "=m" (dpDest[iDataCntr + 3]),
00283               "=m" (dpDest[iDataCntr + 4]),
00284               "=m" (dpDest[iDataCntr + 5]),
00285               "=m" (dpDest[iDataCntr + 6]),
00286               "=m" (dpDest[iDataCntr + 7])
00287             : "m" (dpSrc[iDataCntr]),
00288               "m" (dpSrc[iDataCntr + 1]),
00289               "m" (dpSrc[iDataCntr + 2]),
00290               "m" (dpSrc[iDataCntr + 3]),
00291               "m" (dpSrc[iDataCntr + 4]),
00292               "m" (dpSrc[iDataCntr + 5]),
00293               "m" (dpSrc[iDataCntr + 6]),
00294               "m" (dpSrc[iDataCntr + 7]),
00295               "m" (dpSrc[iDataCntr + 32])
00296             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00297     }
          /* Tail: up to 7 remaining doubles, one at a time. */
00298     iStartIdx = iDataCount;
00299     iDataCount = iDataLength;
00300     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00301     {
00302         X86_ASM (
00303             "prefetchnta %2\n\t" \
00304             "movq %1, %%mm0\n\t" \
00305             "movntq %%mm0, %0\n\t"
00306             : "=m" (dpDest[iDataCntr])
00307             : "m" (dpSrc[iDataCntr]),
00308               "m" (dpSrc[iDataCntr + 32])
00309             : "mm0", "memory");
00310     }
00311     X86_ASM (
00312         "femms\n\t" \
00313         "sfence\n\t");
00314 }
00315 
00316 
/*
    Add the scalar fSrc to every element of fpVect, in place (3DNow!).

    fSrc is duplicated into both float slots of an stm64 value and loaded
    into mm1 once.  The vector is then processed two floats at a time
    (movq load, pfadd, movntq store); an odd trailing element is handled
    with movd.  The function finishes with femms and sfence.

    NOTE(review): mm1 is loaded in its own asm statement and read by the
    later ones, so the code relies on the compiler not touching mm1 in
    between.

    fpVect       vector modified in place
    fSrc         value added to each element
    iDataLength  number of floats in fpVect
*/
00317 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
00318 {
00319     int iDataCntr;
00320     int iDataCount;
00321     stpm64 m64pVect = (stpm64) fpVect;
00322     stm64 m64Src;
00323 
00324     m64Src.f[0] = m64Src.f[1] = fSrc;
          /* Number of float pairs. */
00325     iDataCount = (iDataLength >> 1);
00326     X86_ASM (
00327         "movq %0, %%mm1\n\t"
00328         :
00329         : "m" (m64Src)
00330         : "mm1", "memory");
00331     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00332     {
00333         X86_ASM (
00334             "movq %1, %%mm0\n\t" \
00335             "pfadd %%mm1, %%mm0\n\t" \
00336             "movntq %%mm0, %0\n\t"
00337             : "=m" (m64pVect[iDataCntr])
00338             : "m0" (m64pVect[iDataCntr])
00339             : "mm0", "mm1", "memory");
00340     }
          /* Odd length: process the last float on its own. */
00341     if (iDataLength & 0x1)
00342     {
00343         X86_ASM (
00344             "movd %1, %%mm0\n\t" \
00345             "pfadd %%mm1, %%mm0\n\t" \
00346             "movd %%mm0, %0\n\t"
00347             : "=m" (fpVect[iDataLength - 1])
00348             : "m0" (fpVect[iDataLength - 1])
00349             : "mm0", "mm1", "memory");
00350     }
00351     X86_ASM (
00352         "femms\n\t" \
00353         "sfence\n\t");
00354 }
00355 
00356 
/*
    Add the scalar fSrc to every element of fpVect, in place, using
    scalar SSE instructions (movss/addss), one element per iteration.

    NOTE(review): xmm1 is loaded with fSrc in a separate asm statement
    before the loop and read inside it, so the code relies on the compiler
    not using xmm1 in between.

    fpVect       vector modified in place
    fSrc         value added to each element
    iDataLength  number of floats in fpVect
*/
00357 void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
00358 {
00359     int iDataCntr;
00360     
00361     X86_ASM (
00362         "movss %0, %%xmm1\n\t"
00363         :
00364         : "m" (fSrc)
00365         : "xmm1", "memory");
00366     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00367     {
00368         X86_ASM (
00369             "movss %1, %%xmm0\n\t" \
00370             "addss %%xmm1, %%xmm0\n\t" \
00371             "movss %%xmm0, %0\n\t"
00372             : "=m" (fpVect[iDataCntr])
00373             : "m0" (fpVect[iDataCntr])
00374             : "xmm0", "xmm1", "memory");
00375     }
00376 }
00377 
00378 
/*
    Add the scalar dSrc to every element of dpVect, in place, using
    scalar double-precision instructions (movsd/addsd), one element per
    iteration.

    NOTE(review): xmm1 is loaded with dSrc in a separate asm statement
    before the loop and read inside it, so the code relies on the compiler
    not using xmm1 in between.

    dpVect       vector modified in place
    dSrc         value added to each element
    iDataLength  number of doubles in dpVect
*/
00379 void dsp_x86_sse_add (double *dpVect, double dSrc, int iDataLength)
00380 {
00381     int iDataCntr;
00382     
00383     X86_ASM (
00384         "movsd %0, %%xmm1\n\t"
00385         :
00386         : "m" (dSrc)
00387         : "xmm1", "memory");
00388     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00389     {
00390         X86_ASM (
00391             "movsd %1, %%xmm0\n\t" \
00392             "addsd %%xmm1, %%xmm0\n\t" \
00393             "movsd %%xmm0, %0\n\t"
00394             : "=m" (dpVect[iDataCntr])
00395             : "m0" (dpVect[iDataCntr])
00396             : "xmm0", "xmm1", "memory");
00397     }
00398 }
00399 
00400 
/*
    Multiply every element of fpVect by the scalar fSrc, in place
    (3DNow!).

    fSrc is duplicated into both float slots of an stm64 value and loaded
    into mm1 once.  The vector is then processed two floats at a time
    (movq load, pfmul, movntq store); an odd trailing element is handled
    with movd.  The function finishes with femms and sfence.

    NOTE(review): mm1 is loaded in its own asm statement and read by the
    later ones, so the code relies on the compiler not touching mm1 in
    between.

    fpVect       vector modified in place
    fSrc         multiplier applied to each element
    iDataLength  number of floats in fpVect
*/
00401 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
00402 {
00403     int iDataCntr;
00404     int iDataCount;
00405     stpm64 m64pVect = (stpm64) fpVect;
00406     stm64 m64Src;
00407 
00408     m64Src.f[0] = m64Src.f[1] = fSrc;
          /* Number of float pairs. */
00409     iDataCount = (iDataLength >> 1);
00410     X86_ASM (
00411         "movq %0, %%mm1\n\t"
00412         :
00413         : "m" (m64Src)
00414         : "mm1", "memory");
00415     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00416     {
00417         X86_ASM (
00418             "movq %1, %%mm0\n\t" \
00419             "pfmul %%mm1, %%mm0\n\t" \
00420             "movntq %%mm0, %0\n\t"
00421             : "=m" (m64pVect[iDataCntr])
00422             : "m0" (m64pVect[iDataCntr])
00423             : "mm0", "mm1", "memory");
00424     }
          /* Odd length: process the last float on its own. */
00425     if (iDataLength & 0x1)
00426     {
00427         X86_ASM (
00428             "movd %1, %%mm0\n\t" \
00429             "pfmul %%mm1, %%mm0\n\t" \
00430             "movd %%mm0, %0\n\t"
00431             : "=m" (fpVect[iDataLength - 1])
00432             : "m0" (fpVect[iDataLength - 1])
00433             : "mm0", "mm1", "memory");
00434     }
00435     X86_ASM (
00436         "femms\n\t" \
00437         "sfence\n\t");
00438 }
00439 
00440 
/*
    Multiply every element of fpVect by the scalar fSrc, in place, using
    scalar SSE instructions (movss/mulss), one element per iteration.

    NOTE(review): xmm1 is loaded with fSrc in a separate asm statement
    before the loop and read inside it, so the code relies on the compiler
    not using xmm1 in between.

    fpVect       vector modified in place
    fSrc         multiplier applied to each element
    iDataLength  number of floats in fpVect
*/
00441 void dsp_x86_sse_mulf (float *fpVect, float fSrc, int iDataLength)
00442 {
00443     int iDataCntr;
00444 
00445     X86_ASM (
00446         "movss %0, %%xmm1\n\t"
00447         :
00448         : "m" (fSrc)
00449         : "xmm1", "memory");
00450     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00451     {
00452         X86_ASM (
00453             "movss %1, %%xmm0\n\t" \
00454             "mulss %%xmm1, %%xmm0\n\t" \
00455             "movss %%xmm0, %0\n\t"
00456             : "=m" (fpVect[iDataCntr])
00457             : "m0" (fpVect[iDataCntr])
00458             : "xmm0", "xmm1", "memory");
00459     }
00460 }
00461 
00462 
/*
    Multiply every element of dpVect by the scalar dSrc, in place, using
    scalar double-precision instructions (movsd/mulsd), one element per
    iteration.

    NOTE(review): xmm1 is loaded with dSrc in a separate asm statement
    before the loop and read inside it, so the code relies on the compiler
    not using xmm1 in between.

    dpVect       vector modified in place
    dSrc         multiplier applied to each element
    iDataLength  number of doubles in dpVect
*/
00463 void dsp_x86_sse_mul (double *dpVect, double dSrc, int iDataLength)
00464 {
00465     int iDataCntr;
00466     
00467     X86_ASM (
00468         "movsd %0, %%xmm1\n\t"
00469         :
00470         : "m" (dSrc)
00471         : "xmm1", "memory");
00472     for (iDataCntr = 0; iDataCntr <iDataLength; iDataCntr++)
00473     {
00474         X86_ASM (
00475             "movsd %1, %%xmm0\n\t" \
00476             "mulsd %%xmm1, %%xmm0\n\t" \
00477             "movsd %%xmm0, %0\n\t"
00478             : "=m" (dpVect[iDataCntr])
00479             : "m0" (dpVect[iDataCntr])
00480             : "xmm0", "xmm1", "memory");
00481     }
00482 }
00483 
00484 
/*
    Multiply a vector by a scalar into a separate destination (3DNow!):
    fpDest[i] = fpSrc1[i] * fSrc2 for i in [0, iDataLength).

    fSrc2 is duplicated into both float slots of an stm64 value and loaded
    into mm1 once.  The vectors are then processed two floats at a time
    (movq load, pfmul, movntq store); an odd trailing element is handled
    with movd.  The function finishes with femms and sfence.

    NOTE(review): mm1 is loaded in its own asm statement and read by the
    later ones, so the code relies on the compiler not touching mm1 in
    between.

    fpDest       destination vector
    fpSrc1       source vector
    fSrc2        multiplier applied to each element
    iDataLength  number of floats to process
*/
00485 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2, 
00486     int iDataLength)
00487 {
00488     int iDataCntr;
00489     int iDataCount;
00490     stpm64 m64pDest = (stpm64) fpDest;
00491     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00492     stm64 m64Src2;
00493 
00494     m64Src2.f[0] = m64Src2.f[1] = fSrc2;
          /* Number of float pairs. */
00495     iDataCount = (iDataLength >> 1);
00496     X86_ASM (
00497         "movq %0, %%mm1\n\t"
00498         :
00499         : "m" (m64Src2)
00500         : "mm1", "memory");
00501     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00502     {
00503         X86_ASM (
00504             "movq %1, %%mm0\n\t" \
00505             "pfmul %%mm1, %%mm0\n\t" \
00506             "movntq %%mm0, %0\n\t"
00507             : "=m" (m64pDest[iDataCntr])
00508             : "m" (m64pSrc1[iDataCntr])
00509             : "mm0", "mm1", "memory");
00510     }
          /* Odd length: process the last float on its own. */
00511     if (iDataLength & 0x1)
00512     {
00513         X86_ASM (
00514             "movd %1, %%mm0\n\t" \
00515             "pfmul %%mm1, %%mm0\n\t" \
00516             "movd %%mm0, %0\n\t"
00517             : "=m" (fpDest[iDataLength - 1])
00518             : "m" (fpSrc1[iDataLength - 1])
00519             : "mm0", "mm1", "memory");
00520     }
00521     X86_ASM (
00522         "femms\n\t" \
00523         "sfence\n\t");
00524 }
00525 
00526 
/*
    Multiply a vector by a scalar into a separate destination using
    scalar SSE instructions: fpDest[i] = fpSrc1[i] * fSrc2 for i in
    [0, iDataLength).

    NOTE(review): xmm1 is loaded with fSrc2 in a separate asm statement
    before the loop and read inside it, so the code relies on the compiler
    not using xmm1 in between.

    fpDest       destination vector
    fpSrc1       source vector
    fSrc2        multiplier applied to each element
    iDataLength  number of floats to process
*/
00527 void dsp_x86_sse_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00528     int iDataLength)
00529 {
00530     int iDataCntr;
00531 
00532     X86_ASM (
00533         "movss %0, %%xmm1\n\t"
00534         :
00535         : "m" (fSrc2)
00536         : "xmm1", "memory");
00537     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00538     {
00539         X86_ASM (
00540             "movss %1, %%xmm0\n\t" \
00541             "mulss %%xmm1, %%xmm0\n\t" \
00542             "movss %%xmm0, %0\n\t"
00543             : "=m" (fpDest[iDataCntr])
00544             : "m" (fpSrc1[iDataCntr])
00545             : "xmm0", "xmm1", "memory");
00546     }
00547 }
00548 
00549 
/*
    Multiply a vector by a scalar into a separate destination using
    scalar double-precision instructions (movsd/mulsd):
    dpDest[i] = dpSrc1[i] * dSrc2 for i in [0, iDataLength).

    NOTE(review): xmm1 is loaded with dSrc2 in a separate asm statement
    before the loop and read inside it, so the code relies on the compiler
    not using xmm1 in between.

    dpDest       destination vector
    dpSrc1       source vector
    dSrc2        multiplier applied to each element
    iDataLength  number of doubles to process
*/
00550 void dsp_x86_sse_mul_nip (double *dpDest, const double *dpSrc1, double dSrc2,
00551     int iDataLength)
00552 {
00553     int iDataCntr;
00554     
00555     X86_ASM (
00556         "movsd %0, %%xmm1\n\t"
00557         :
00558         : "m" (dSrc2)
00559         : "xmm1", "memory");
00560     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00561     {
00562         X86_ASM (
00563             "movsd %1, %%xmm0\n\t" \
00564             "mulsd %%xmm1, %%xmm0\n\t" \
00565             "movsd %%xmm0, %0\n\t"
00566             : "=m" (dpDest[iDataCntr])
00567             : "m" (dpSrc1[iDataCntr])
00568             : "xmm0", "xmm1", "memory");
00569     }
00570 }
00571 
00572 
/*
    Element-wise in-place addition (3DNow!): fpDest[i] += fpSrc[i] for i
    in [0, iDataLength).

    The vectors are processed two floats at a time (two movq loads, pfadd,
    movntq store); an odd trailing element is handled with movd.  The
    function finishes with femms and sfence.

    fpDest       vector updated in place
    fpSrc        vector added to fpDest
    iDataLength  number of floats to process
*/
00573 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00574 {
00575     int iDataCntr;
00576     int iDataCount;
00577     stpm64 m64pDest = (stpm64) fpDest;
00578     stpm64 m64pSrc = (stpm64) fpSrc;
00579 
          /* Number of float pairs. */
00580     iDataCount = (iDataLength >> 1);
00581     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00582     {
00583         X86_ASM (
00584             "movq %1, %%mm0\n\t" \
00585             "movq %2, %%mm1\n\t" \
00586             "pfadd %%mm1, %%mm0\n\t" \
00587             "movntq %%mm0, %0\n\t"
00588             : "=m" (m64pDest[iDataCntr])
00589             : "m0" (m64pDest[iDataCntr]),
00590               "m" (m64pSrc[iDataCntr])
00591             : "mm0", "mm1", "memory");
00592     }
          /* Odd length: process the last float on its own. */
00593     if (iDataLength & 0x1)
00594     {
00595         X86_ASM (
00596             "movd %1, %%mm0\n\t" \
00597             "movd %2, %%mm1\n\t" \
00598             "pfadd %%mm1, %%mm0\n\t" \
00599             "movd %%mm0, %0\n\t"
00600             : "=m" (fpDest[iDataLength - 1])
00601             : "m0" (fpDest[iDataLength - 1]),
00602               "m" (fpSrc[iDataLength - 1])
00603             : "mm0", "mm1", "memory");
00604     }
00605     X86_ASM (
00606         "femms\n\t" \
00607         "sfence\n\t");
00608 }
00609 
00610 
00611 void dsp_x86_sse_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00612 {
00613     int iDataCntr;
00614     
00615     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00616     {
00617         X86_ASM (
00618             "movss %1, %%xmm0\n\t" \
00619             "addss %2, %%xmm0\n\t" \
00620             "movss %%xmm0, %0\n\t"
00621             : "=m" (fpDest[iDataCntr])
00622             : "m0" (fpDest[iDataCntr]),
00623               "m" (fpSrc[iDataCntr])
00624             : "xmm0", "memory");
00625     }
00626 }
00627 
00628 
00629 void dsp_x86_sse_add2 (double *dpDest, const double *dpSrc, int iDataLength)
00630 {
00631     int iDataCntr;
00632     
00633     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00634     {
00635         X86_ASM (
00636             "movsd %1, %%xmm0\n\t" \
00637             "addsd %2, %%xmm0\n\t" \
00638             "movsd %%xmm0, %0\n\t"
00639             : "=m" (dpDest[iDataCntr])
00640             : "m0" (dpDest[iDataCntr]),
00641               "m" (dpSrc[iDataCntr])
00642             : "xmm0", "memory");
00643     }
00644 }
00645 
00646 
/*
    Element-wise in-place multiplication (3DNow!): fpDest[i] *= fpSrc[i]
    for i in [0, iDataLength).

    The vectors are processed two floats at a time (two movq loads, pfmul,
    movntq store); an odd trailing element is handled with movd.  The
    function finishes with femms and sfence.

    fpDest       vector updated in place
    fpSrc        vector of multipliers
    iDataLength  number of floats to process
*/
00647 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00648 {
00649     int iDataCntr;
00650     int iDataCount;
00651     stpm64 m64pDest = (stpm64) fpDest;
00652     stpm64 m64pSrc = (stpm64) fpSrc;
00653 
          /* Number of float pairs. */
00654     iDataCount = (iDataLength >> 1);
00655     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00656     {
00657         X86_ASM (
00658             "movq %1, %%mm0\n\t" \
00659             "movq %2, %%mm1\n\t" \
00660             "pfmul %%mm1, %%mm0\n\t" \
00661             "movntq %%mm0, %0\n\t"
00662             : "=m" (m64pDest[iDataCntr])
00663             : "m0" (m64pDest[iDataCntr]),
00664               "m" (m64pSrc[iDataCntr])
00665             : "mm0", "mm1", "memory");
00666     }
          /* Odd length: process the last float on its own. */
00667     if (iDataLength & 0x1)
00668     {
00669         X86_ASM (
00670             "movd %1, %%mm0\n\t" \
00671             "movd %2, %%mm1\n\t" \
00672             "pfmul %%mm1, %%mm0\n\t" \
00673             "movd %%mm0, %0\n\t"
00674             : "=m" (fpDest[iDataLength - 1])
00675             : "m0" (fpDest[iDataLength - 1]),
00676               "m" (fpSrc[iDataLength - 1])
00677             : "mm0", "mm1", "memory");
00678     }
00679     X86_ASM (
00680         "femms\n\t" \
00681         "sfence\n\t");
00682 }
00683 
00684 
00685 void dsp_x86_sse_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00686 {
00687     int iDataCntr;
00688     
00689     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00690     {
00691         X86_ASM (
00692             "movss %1, %%xmm0\n\t" \
00693             "mulss %2, %%xmm0\n\t" \
00694             "movss %%xmm0, %0\n\t"
00695             : "=m" (fpDest[iDataCntr])
00696             : "m0" (fpDest[iDataCntr]),
00697               "m" (fpSrc[iDataCntr])
00698             : "xmm0", "memory");
00699     }
00700 }
00701 
00702 
00703 void dsp_x86_sse_mul2 (double *dpDest, const double *dpSrc, int iDataLength)
00704 {
00705     int iDataCntr;
00706     
00707     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00708     {
00709         X86_ASM (
00710             "movsd %1, %%xmm0\n\t" \
00711             "mulsd %2, %%xmm0\n\t" \
00712             "movsd %%xmm0, %0\n\t"
00713             : "=m" (dpDest[iDataCntr])
00714             : "m0" (dpDest[iDataCntr]),
00715               "m" (dpSrc[iDataCntr])
00716             : "xmm0", "memory");
00717     }
00718 }
00719 
00720 
/*
    Element-wise addition of two vectors into a third (3DNow!):
    fpDest[i] = fpSrc1[i] + fpSrc2[i] for i in [0, iDataLength).

    The vectors are processed two floats at a time (two movq loads, pfadd,
    movntq store); an odd trailing element is handled with movd.  The
    function finishes with femms and sfence.

    fpDest       destination vector
    fpSrc1       first source vector
    fpSrc2       second source vector
    iDataLength  number of floats to process
*/
00721 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1, 
00722     const float *fpSrc2, int iDataLength)
00723 {
00724     int iDataCntr;
00725     int iDataCount;
00726     stpm64 m64pDest = (stpm64) fpDest;
00727     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00728     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00729 
          /* Number of float pairs. */
00730     iDataCount = (iDataLength >> 1);
00731     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00732     {
00733         X86_ASM (
00734             "movq %1, %%mm0\n\t" \
00735             "movq %2, %%mm1\n\t" \
00736             "pfadd %%mm1, %%mm0\n\t" \
00737             "movntq %%mm0, %0\n\t"
00738             : "=m" (m64pDest[iDataCntr])
00739             : "m" (m64pSrc1[iDataCntr]),
00740               "m" (m64pSrc2[iDataCntr])
00741             : "mm0", "mm1", "memory");
00742     }
          /* Odd length: process the last float on its own. */
00743     if (iDataLength & 0x1)
00744     {
00745         X86_ASM (
00746             "movd %1, %%mm0\n\t" \
00747             "movd %2, %%mm1\n\t" \
00748             "pfadd %%mm1, %%mm0\n\t" \
00749             "movd %%mm0, %0\n\t"
00750             : "=m" (fpDest[iDataLength - 1])
00751             : "m" (fpSrc1[iDataLength - 1]),
00752               "m" (fpSrc2[iDataLength - 1])
00753             : "mm0", "mm1", "memory");
00754     }
00755     X86_ASM (
00756         "femms\n\t" \
00757         "sfence\n\t");
00758 }
00759 
00760 
00761 void dsp_x86_sse_add3f (float *fpDest, const float *fpSrc1, 
00762     const float *fpSrc2, int iDataLength)
00763 {
00764     int iDataCntr;
00765     
00766     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00767     {
00768         X86_ASM (
00769             "movss %1, %%xmm0\n\t" \
00770             "addss %2, %%xmm0\n\t" \
00771             "movss %%xmm0, %0\n\t"
00772             : "=m" (fpDest[iDataCntr])
00773             : "m" (fpSrc1[iDataCntr]),
00774               "m" (fpSrc2[iDataCntr])
00775             : "xmm0", "memory");
00776     }
00777 }
00778 
00779 
00780 void dsp_x86_sse_add3 (double *dpDest, const double *dpSrc1, 
00781     const double *dpSrc2, int iDataLength)
00782 {
00783     int iDataCntr;
00784     
00785     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00786     {
00787         X86_ASM (
00788             "movsd %1, %%xmm0\n\t" \
00789             "addsd %2, %%xmm0\n\t" \
00790             "movsd %%xmm0, %0\n\t"
00791             : "=m" (dpDest[iDataCntr])
00792             : "m" (dpSrc1[iDataCntr]),
00793               "m" (dpSrc2[iDataCntr])
00794             : "xmm0", "memory");
00795     }
00796 }
00797 
00798 
/*
    Element-wise multiplication of two vectors into a third (3DNow!):
    fpDest[i] = fpSrc1[i] * fpSrc2[i] for i in [0, iDataLength).

    The vectors are processed two floats at a time (two movq loads, pfmul,
    movntq store); an odd trailing element is handled with movd.  The
    function finishes with femms and sfence.

    fpDest       destination vector
    fpSrc1       first source vector
    fpSrc2       second source vector
    iDataLength  number of floats to process
*/
00799 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1, 
00800     const float *fpSrc2, int iDataLength)
00801 {
00802     int iDataCntr;
00803     int iDataCount;
00804     stpm64 m64pDest = (stpm64) fpDest;
00805     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00806     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00807 
          /* Number of float pairs. */
00808     iDataCount = (iDataLength >> 1);
00809     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00810     {
00811         X86_ASM (
00812             "movq %1, %%mm0\n\t" \
00813             "movq %2, %%mm1\n\t" \
00814             "pfmul %%mm1, %%mm0\n\t" \
00815             "movntq %%mm0, %0\n\t"
00816             : "=m" (m64pDest[iDataCntr])
00817             : "m" (m64pSrc1[iDataCntr]),
00818               "m" (m64pSrc2[iDataCntr])
00819             : "mm0", "mm1", "memory");
00820     }
          /* Odd length: process the last float on its own. */
00821     if (iDataLength & 0x1)
00822     {
00823         X86_ASM (
00824             "movd %1, %%mm0\n\t" \
00825             "movd %2, %%mm1\n\t" \
00826             "pfmul %%mm1, %%mm0\n\t" \
00827             "movd %%mm0, %0\n\t"
00828             : "=m" (fpDest[iDataLength - 1])
00829             : "m" (fpSrc1[iDataLength - 1]),
00830               "m" (fpSrc2[iDataLength - 1])
00831             : "mm0", "mm1", "memory");
00832     }
00833     X86_ASM (
00834         "femms\n\t" \
00835         "sfence\n\t");
00836 }
00837 
00838 
00839 void dsp_x86_sse_mul3f (float *fpDest, const float *fpSrc1, 
00840     const float *fpSrc2, int iDataLength)
00841 {
00842     int iDataCntr;
00843     
00844     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00845     {
00846         X86_ASM (
00847             "movss %1, %%xmm0\n\t" \
00848             "mulss %2, %%xmm0\n\t" \
00849             "movss %%xmm0, %0\n\t"
00850             : "=m" (fpDest[iDataCntr])
00851             : "m" (fpSrc1[iDataCntr]),
00852               "m" (fpSrc2[iDataCntr])
00853             : "xmm0", "memory");
00854     }
00855 }
00856 
00857 
00858 void dsp_x86_sse_mul3 (double *dpDest, const double *dpSrc1, 
00859     const double *dpSrc2, int iDataLength)
00860 {
00861     int iDataCntr;
00862     
00863     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00864     {
00865         X86_ASM (
00866             "movsd %1, %%xmm0\n\t" \
00867             "mulsd %2, %%xmm0\n\t" \
00868             "movsd %%xmm0, %0\n\t"
00869             : "=m" (dpDest[iDataCntr])
00870             : "m" (dpSrc1[iDataCntr]),
00871               "m" (dpSrc2[iDataCntr])
00872             : "xmm0", "memory");
00873     }
00874 }
00875 
00876 
/*
    Multiply every complex element of fpDest, in place, by the single
    complex value stored at fpSrc[0] (real) and fpSrc[1] (imaginary).

    fpDest is accessed as an array of 64-bit words, one interleaved
    (real, imaginary) float pair per word; iDataLength is the number of
    such pairs.  The multiplier is loaded once into mm3 with a 64-bit
    movq, so both fpSrc[0] and fpSrc[1] are read even though the operand
    names only fpSrc[0].

    Per element the asm sequence loads the destination pair into mm0,
    copies the multiplier to mm1 and a half-swapped copy to mm2 (pswapd),
    multiplies both by mm0 (pfmul) and combines the products with pfpnacc
    (difference in the low half, sum in the high half) before a movntq
    store.  The function finishes with femms and sfence.

    NOTE(review): mm3 is loaded in its own asm statement and read inside
    the loop, so the code relies on the compiler not touching mm3 in
    between.

    fpDest       complex vector updated in place
    fpSrc        pointer to one complex multiplier (two floats)
    iDataLength  number of complex elements
*/
00877 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00878 {
00879     int iDataCntr;
00880     stpm64 m64pDest = (stpm64) fpDest;
00881     
00882     X86_ASM (
00883         "movq %0, %%mm3\n\t"
00884         :
00885         : "m" (fpSrc[0])
00886         : "mm3", "memory");
00887     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00888     {
00889         X86_ASM (
00890             "movq %1, %%mm0\n\t" \
00891             "movq %%mm3, %%mm1\n\t" \
00892             "pswapd %%mm1, %%mm2\n\t" \
00893             "pfmul %%mm0, %%mm1\n\t" \
00894             "pfmul %%mm0, %%mm2\n\t" \
00895             "pfpnacc %%mm2, %%mm1\n\t"
00896             "movntq %%mm1, %0\n\t"
00897             : "=m" (m64pDest[iDataCntr])
00898             : "m0" (m64pDest[iDataCntr])
00899             : "mm0", "mm1", "mm2", "mm3", "memory");
00900     }
00901     X86_ASM (
00902         "femms\n\t" \
00903         "sfence\n\t");
00904 }
00905 
00906 
00907 void dsp_x86_sse_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00908 {
00909     int iDataCntr;
00910     int iDataCount;
00911     
00912     X86_ASM (
00913         "movss %0, %%xmm2\n\t" \
00914         "movss %1, %%xmm3\n\t"
00915         :
00916         : "m" (fpSrc[0]),
00917           "m" (fpSrc[1])
00918         : "xmm2", "xmm3", "memory");
00919     iDataCount = (iDataLength << 1);
00920     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00921     {
00922         X86_ASM (
00923             "movss %2, %%xmm0\n\t" \
00924             "movss %%xmm0, %%xmm1\n\t" \
00925             "movss %3, %%xmm4\n\t" \
00926             \
00927             "mulss %%xmm2, %%xmm0\n\t" \
00928             "movss %%xmm4, %%xmm5\n\t" \
00929             "mulss %%xmm3, %%xmm5\n\t" \
00930             "subss %%xmm0, %%xmm5\n\t" \
00931             \
00932             "mulss %%xmm3, %%xmm1\n\t" \
00933             "movss %%xmm4, %%xmm5\n\t" \
00934             "mulss %%xmm2, %%xmm5\n\t" \
00935             "addss %%xmm5, %%xmm1\n\t" \
00936             \
00937             "movss %%xmm0, %0\n\t" \
00938             "movss %%xmm1, %1\n\t"
00939             : "=m" (fpDest[iDataCntr]),
00940               "=m" (fpDest[iDataCntr + 1])
00941             : "m0" (fpDest[iDataCntr]),
00942               "m1" (fpDest[iDataCntr + 1])
00943             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00944     }
00945 }
00946 
00947 
00948 void dsp_x86_sse_cmul (double *dpDest, const double *dpSrc, int iDataLength)
00949 {
00950     int iDataCntr;
00951     int iDataCount;
00952     
00953     X86_ASM (
00954         "movsd %0, %%xmm2\n\t" \
00955         "movsd %1, %%xmm3\n\t"
00956         :
00957         : "m" (dpSrc[0]),
00958           "m" (dpSrc[1])
00959         : "xmm2", "xmm3", "memory");
00960     iDataCount = (iDataLength << 1);
00961     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00962     {
00963         X86_ASM (
00964             "movsd %2, %%xmm0\n\t" \
00965             "movsd %%xmm0, %%xmm1\n\t" \
00966             "movsd %3, %%xmm4\n\t" \
00967             \
00968             "mulsd %%xmm2, %%xmm0\n\t" \
00969             "movsd %%xmm4, %%xmm5\n\t" \
00970             "mulsd %%xmm3, %%xmm5\n\t" \
00971             "subsd %%xmm0, %%xmm5\n\t" \
00972             \
00973             "mulsd %%xmm3, %%xmm1\n\t" \
00974             "movsd %%xmm4, %%xmm5\n\t" \
00975             "mulsd %%xmm2, %%xmm5\n\t" \
00976             "addsd %%xmm5, %%xmm1\n\t" \
00977             \
00978             "movsd %%xmm0, %0\n\t" \
00979             "movsd %%xmm1, %1\n\t"
00980             : "=m" (dpDest[iDataCntr]),
00981               "=m" (dpDest[iDataCntr + 1])
00982             : "m0" (dpDest[iDataCntr]),
00983               "m1" (dpDest[iDataCntr + 1])
00984             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00985     }
00986 }
00987 
00988 
00989 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
00990 {
00991     int iDataCntr;
00992     stpm64 m64pDest = (stpm64) fpDest;
00993     stpm64 m64pSrc = (stpm64) fpSrc;
00994     
00995     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00996     {
00997         X86_ASM (
00998             "movq %1, %%mm0\n\t" \
00999             "movq %2, %%mm1\n\t" \
01000             "pswapd %%mm1, %%mm2\n\t" \
01001             "pfmul %%mm0, %%mm1\n\t" \
01002             "pfmul %%mm0, %%mm2\n\t" \
01003             "pfpnacc %%mm2, %%mm1\n\t"
01004             "movntq %%mm1, %0\n\t"
01005             : "=m" (m64pDest[iDataCntr])
01006             : "m0" (m64pDest[iDataCntr]),
01007               "m" (m64pSrc[iDataCntr])
01008             : "mm0", "mm1", "mm2", "memory");
01009     }
01010     X86_ASM (
01011         "femms\n\t" \
01012         "sfence\n\t");
01013 }
01014 
01015 
01016 void dsp_x86_sse_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
01017 {
01018     int iDataCntr;
01019     int iDataCount;
01020     
01021     iDataCount = (iDataLength << 1);
01022     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01023     {
01024         X86_ASM (
01025             "movss %4, %%xmm2\n\t" \
01026             "movss %5, %%xmm3\n\t" \
01027             \
01028             "movss %2, %%xmm0\n\t" \
01029             "movss %%xmm0, %%xmm1\n\t" \
01030             "movss %3, %%xmm4\n\t" \
01031             \
01032             "mulss %%xmm2, %%xmm0\n\t" \
01033             "movss %%xmm4, %%xmm5\n\t" \
01034             "mulss %%xmm3, %%xmm5\n\t" \
01035             "subss %%xmm0, %%xmm5\n\t" \
01036             \
01037             "mulss %%xmm3, %%xmm1\n\t" \
01038             "movss %%xmm4, %%xmm5\n\t" \
01039             "mulss %%xmm2, %%xmm5\n\t" \
01040             "addss %%xmm5, %%xmm1\n\t" \
01041             \
01042             "movss %%xmm0, %0\n\t" \
01043             "movss %%xmm1, %1\n\t"
01044             : "=m" (fpDest[iDataCntr]),
01045               "=m" (fpDest[iDataCntr + 1])
01046             : "m0" (fpDest[iDataCntr]),
01047               "m1" (fpDest[iDataCntr + 1]),
01048               "m" (fpSrc[iDataCntr]),
01049               "m" (fpSrc[iDataCntr + 1])
01050             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01051     }
01052 }
01053 
01054 
01055 void dsp_x86_sse_cmul2 (double *dpDest, const double *dpSrc, int iDataLength)
01056 {
01057     int iDataCntr;
01058     int iDataCount;
01059     
01060     iDataCount = (iDataLength << 1);
01061     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01062     {
01063         X86_ASM (
01064             "movsd %4, %%xmm2\n\t" \
01065             "movsd %5, %%xmm3\n\t" \
01066             \
01067             "movsd %2, %%xmm0\n\t" \
01068             "movsd %%xmm0, %%xmm1\n\t" \
01069             "movsd %3, %%xmm4\n\t" \
01070             \
01071             "mulsd %%xmm2, %%xmm0\n\t" \
01072             "movsd %%xmm4, %%xmm5\n\t" \
01073             "mulsd %%xmm3, %%xmm5\n\t" \
01074             "subsd %%xmm0, %%xmm5\n\t" \
01075             \
01076             "mulsd %%xmm3, %%xmm1\n\t" \
01077             "movsd %%xmm4, %%xmm5\n\t" \
01078             "mulsd %%xmm2, %%xmm5\n\t" \
01079             "addsd %%xmm5, %%xmm1\n\t" \
01080             \
01081             "movsd %%xmm0, %0\n\t" \
01082             "movsd %%xmm1, %1\n\t"
01083             : "=m" (dpDest[iDataCntr]),
01084               "=m" (dpDest[iDataCntr + 1])
01085             : "m0" (dpDest[iDataCntr]),
01086               "m1" (dpDest[iDataCntr + 1]),
01087               "m" (dpSrc[iDataCntr]),
01088               "m" (dpSrc[iDataCntr + 1])
01089             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01090     }
01091 }
01092 
01093 
01094 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1, 
01095     const float *fpSrc2, int iDataLength)
01096 {
01097     int iDataCntr;
01098     stpm64 m64pDest = (stpm64) fpDest;
01099     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01100     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01101     
01102     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01103     {
01104         X86_ASM (
01105             "movq %1, %%mm0\n\t" \
01106             "movq %2, %%mm1\n\t" \
01107             "pswapd %%mm1, %%mm2\n\t" \
01108             "pfmul %%mm0, %%mm1\n\t" \
01109             "pfmul %%mm0, %%mm2\n\t" \
01110             "pfpnacc %%mm2, %%mm1\n\t"
01111             "movntq %%mm1, %0\n\t"
01112             : "=m" (m64pDest[iDataCntr])
01113             : "m" (m64pSrc1[iDataCntr]),
01114               "m" (m64pSrc2[iDataCntr])
01115             : "mm0", "mm1", "mm2", "memory");
01116     }
01117     X86_ASM (
01118         "femms\n\t" \
01119         "sfence\n\t");
01120 }
01121 
01122 
01123 void dsp_x86_sse_cmul3f (float *fpDest, const float *fpSrc1, 
01124     const float *fpSrc2, int iDataLength)
01125 {
01126     int iDataCntr;
01127     int iDataCount;
01128     
01129     iDataCount = (iDataLength << 1);
01130     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01131     {
01132         X86_ASM (
01133             "movss %4, %%xmm2\n\t" \
01134             "movss %5, %%xmm3\n\t" \
01135             \
01136             "movss %2, %%xmm0\n\t" \
01137             "movss %%xmm0, %%xmm1\n\t" \
01138             "movss %3, %%xmm4\n\t" \
01139             \
01140             "mulss %%xmm2, %%xmm0\n\t" \
01141             "movss %%xmm4, %%xmm5\n\t" \
01142             "mulss %%xmm3, %%xmm5\n\t" \
01143             "subss %%xmm0, %%xmm5\n\t" \
01144             \
01145             "mulss %%xmm3, %%xmm1\n\t" \
01146             "movss %%xmm4, %%xmm5\n\t" \
01147             "mulss %%xmm2, %%xmm5\n\t" \
01148             "addss %%xmm5, %%xmm1\n\t" \
01149             \
01150             "movss %%xmm0, %0\n\t" \
01151             "movss %%xmm1, %1\n\t"
01152             : "=m" (fpDest[iDataCntr]),
01153               "=m" (fpDest[iDataCntr + 1])
01154             : "m" (fpSrc1[iDataCntr]),
01155               "m" (fpSrc1[iDataCntr + 1]),
01156               "m" (fpSrc2[iDataCntr]),
01157               "m" (fpSrc2[iDataCntr + 1])
01158             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01159     }
01160 }
01161 
01162 
01163 void dsp_x86_sse_cmul3 (double *dpDest, const double *dpSrc1, 
01164     const double *dpSrc2, int iDataLength)
01165 {
01166     int iDataCntr;
01167     int iDataCount;
01168     
01169     iDataCount = (iDataLength << 1);
01170     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01171     {
01172         X86_ASM (
01173             "movsd %4, %%xmm2\n\t" \
01174             "movsd %5, %%xmm3\n\t" \
01175             \
01176             "movsd %2, %%xmm0\n\t" \
01177             "movsd %%xmm0, %%xmm1\n\t" \
01178             "movsd %3, %%xmm4\n\t" \
01179             \
01180             "mulsd %%xmm2, %%xmm0\n\t" \
01181             "movsd %%xmm4, %%xmm5\n\t" \
01182             "mulsd %%xmm3, %%xmm5\n\t" \
01183             "subsd %%xmm0, %%xmm5\n\t" \
01184             \
01185             "mulsd %%xmm3, %%xmm1\n\t" \
01186             "movsd %%xmm4, %%xmm5\n\t" \
01187             "mulsd %%xmm2, %%xmm5\n\t" \
01188             "addsd %%xmm5, %%xmm1\n\t" \
01189             \
01190             "movsd %%xmm0, %0\n\t" \
01191             "movsd %%xmm1, %1\n\t"
01192             : "=m" (dpDest[iDataCntr]),
01193               "=m" (dpDest[iDataCntr + 1])
01194             : "m" (dpSrc1[iDataCntr]),
01195               "m" (dpSrc1[iDataCntr + 1]),
01196               "m" (dpSrc2[iDataCntr]),
01197               "m" (dpSrc2[iDataCntr + 1])
01198             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01199     }
01200 }
01201 
01202 
01203 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01204 {
01205     int iDataCntr;
01206     int iDataCount;
01207     stpm64 m64pVect = (stpm64) fpVect;
01208     stm64 m64Mul;
01209     stm64 m64Add;
01210 
01211     m64Mul.f[0] = m64Mul.f[1] = fMul;
01212     m64Add.f[0] = m64Add.f[1] = fAdd;
01213     iDataCount = (iDataLength >> 1);
01214     X86_ASM (
01215         "movq %0, %%mm1\n\t" \
01216         "movq %1, %%mm2\n\t"
01217         :
01218         : "m" (m64Mul),
01219           "m" (m64Add)
01220         : "mm1", "mm2", "memory");
01221     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01222     {
01223         X86_ASM (
01224             "movq %1, %%mm0\n\t" \
01225             "pfmul %%mm1, %%mm0\n\t" \
01226             "pfadd %%mm2, %%mm0\n\t" \
01227             "movntq %%mm0, %0\n\t"
01228             : "=m" (m64pVect[iDataCntr])
01229             : "m0" (m64pVect[iDataCntr])
01230             : "mm0", "mm1", "mm2", "memory");
01231     }
01232     if (iDataLength & 0x1)
01233     {
01234         X86_ASM (
01235             "movd %1, %%mm0\n\t" \
01236             "pfmul %%mm1, %%mm0\n\t" \
01237             "pfadd %%mm2, %%mm0\n\t" \
01238             "movd %%mm0, %0\n\t"
01239             : "=m" (fpVect[iDataLength - 1])
01240             : "m0" (fpVect[iDataLength - 1])
01241             : "mm0", "mm1", "mm2", "memory");
01242     }
01243     X86_ASM (
01244         "femms\n\t" \
01245         "sfence\n\t");
01246 }
01247 
01248 
01249 void dsp_x86_sse_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01250 {
01251     int iDataCntr;
01252     
01253     X86_ASM (
01254         "movss %0, %%xmm1\n\t" \
01255         "movss %1, %%xmm2\n\t"
01256         :
01257         : "m" (fMul),
01258           "m" (fAdd)
01259         : "xmm1", "xmm2", "memory");
01260     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01261     {
01262         X86_ASM (
01263             "movss %1, %%xmm0\n\t" \
01264             "mulss %%xmm1, %%xmm0\n\t" \
01265             "addss %%xmm2, %%xmm0\n\t" \
01266             "movss %%xmm0, %0\n\t"
01267             : "=m" (fpVect[iDataCntr])
01268             : "m0" (fpVect[iDataCntr])
01269             : "xmm0", "xmm1", "xmm2", "memory");
01270     }
01271 }
01272 
01273 
01274 void dsp_x86_sse_ma (double *dpVect, double dMul, double dAdd, int iDataLength)
01275 {
01276     int iDataCntr;
01277     
01278     X86_ASM (
01279         "movsd %0, %%xmm1\n\t" \
01280         "movsd %1, %%xmm2\n\t"
01281         :
01282         : "m" (dMul),
01283           "m" (dAdd)
01284         : "xmm1", "xmm2", "memory");
01285     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01286     {
01287         X86_ASM (
01288             "movsd %1, %%xmm0\n\t" \
01289             "mulsd %%xmm1, %%xmm0\n\t" \
01290             "addsd %%xmm2, %%xmm0\n\t" \
01291             "movsd %%xmm0, %0\n\t"
01292             : "=m" (dpVect[iDataCntr])
01293             : "m0" (dpVect[iDataCntr])
01294             : "xmm0", "xmm1", "xmm2", "memory");
01295     }
01296 }
01297 
01298 
01299 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
01300     float fMul, float fAdd, int iDataLength)
01301 {
01302     int iDataCntr;
01303     int iDataCount;
01304     stpm64 m64pDest = (stpm64) fpDest;
01305     stpm64 m64pSrc = (stpm64) fpSrc;
01306     stm64 m64Mul;
01307     stm64 m64Add;
01308 
01309     m64Mul.f[0] = m64Mul.f[1] = fMul;
01310     m64Add.f[0] = m64Add.f[1] = fAdd;
01311     iDataCount = (iDataLength >> 1);
01312     X86_ASM (
01313         "movq %0, %%mm1\n\t" \
01314         "movq %1, %%mm2\n\t"
01315         :
01316         : "m" (m64Mul),
01317           "m" (m64Add)
01318         : "mm1", "mm2", "memory");
01319     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01320     {
01321         X86_ASM (
01322             "movq %1, %%mm0\n\t" \
01323             "pfmul %%mm1, %%mm0\n\t" \
01324             "pfadd %%mm2, %%mm0\n\t" \
01325             "movntq %%mm0, %0\n\t"
01326             : "=m" (m64pDest[iDataCntr])
01327             : "m" (m64pSrc[iDataCntr])
01328             : "mm0", "mm1", "mm2", "memory");
01329     }
01330     if (iDataLength & 0x1)
01331     {
01332         X86_ASM (
01333             "movd %1, %%mm0\n\t" \
01334             "pfmul %%mm1, %%mm0\n\t" \
01335             "pfadd %%mm2, %%mm0\n\t" \
01336             "movd %%mm0, %0\n\t"
01337             : "=m" (fpDest[iDataLength - 1])
01338             : "m" (fpSrc[iDataLength - 1])
01339             : "mm0", "mm1", "mm2", "memory");
01340     }
01341     X86_ASM (
01342         "femms\n\t" \
01343         "sfence\n\t");
01344 }
01345 
01346 
01347 void dsp_x86_sse_ma2f (float *fpDest, const float *fpSrc, 
01348     float fMul, float fAdd, int iDataLength)
01349 {
01350     int iDataCntr;
01351     
01352     X86_ASM (
01353         "movss %0, %%xmm1\n\t" \
01354         "movss %1, %%xmm2\n\t"
01355         :
01356         : "m" (fMul),
01357           "m" (fAdd)
01358         : "xmm1", "xmm2", "memory");
01359     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01360     {
01361         X86_ASM (
01362             "movss %1, %%xmm0\n\t" \
01363             "mulss %%xmm1, %%xmm0\n\t" \
01364             "addss %%xmm2, %%xmm0\n\t" \
01365             "movss %%xmm0, %0\n\t"
01366             : "=m" (fpDest[iDataCntr])
01367             : "m" (fpSrc[iDataCntr])
01368             : "xmm0", "xmm1", "xmm2", "memory");
01369     }
01370 }
01371 
01372 
01373 void dsp_x86_sse_ma2 (double *dpDest, const double *dpSrc, 
01374     double dMul, double dAdd, int iDataLength)
01375 {
01376     int iDataCntr;
01377     
01378     X86_ASM (
01379         "movsd %0, %%xmm1\n\t" \
01380         "movsd %1, %%xmm2\n\t"
01381         :
01382         : "m" (dMul),
01383           "m" (dAdd)
01384         : "xmm1", "xmm2", "memory");
01385     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01386     {
01387         X86_ASM (
01388             "movsd %1, %%xmm0\n\t" \
01389             "mulsd %%xmm1, %%xmm0\n\t" \
01390             "addsd %%xmm2, %%xmm0\n\t" \
01391             "movsd %%xmm0, %0\n\t"
01392             : "=m" (dpDest[iDataCntr])
01393             : "m" (dpSrc[iDataCntr])
01394             : "xmm0", "xmm1", "xmm2", "memory");
01395     }
01396 }
01397 
01398 
01399 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01400 {
01401     int iDataCntr;
01402     int iDataCount;
01403     stpm64 m64pVect = (stpm64) fpVect;
01404     stm64 m64Add;
01405     stm64 m64Mul;
01406 
01407     m64Add.f[0] = m64Add.f[1] = fAdd;
01408     m64Mul.f[0] = m64Mul.f[1] = fMul;
01409     iDataCount = (iDataLength >> 1);
01410     X86_ASM (
01411         "movq %0, %%mm1\n\t" \
01412         "movq %1, %%mm2\n\t"
01413         :
01414         : "m" (m64Add),
01415           "m" (m64Mul)
01416         : "mm1", "mm2", "memory");
01417     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01418     {
01419         X86_ASM (
01420             "movq %1, %%mm0\n\t" \
01421             "pfadd %%mm1, %%mm0\n\t" \
01422             "pfmul %%mm2, %%mm0\n\t" \
01423             "movntq %%mm0, %0\n\t"
01424             : "=m" (m64pVect[iDataCntr])
01425             : "m0" (m64pVect[iDataCntr])
01426             : "mm0", "mm1", "mm2", "memory");
01427     }
01428     if (iDataLength & 0x1)
01429     {
01430         X86_ASM (
01431             "movd %1, %%mm0\n\t" \
01432             "pfadd %%mm1, %%mm0\n\t" \
01433             "pfmul %%mm2, %%mm0\n\t" \
01434             "movd %%mm0, %0\n\t"
01435             : "=m" (fpVect[iDataLength - 1])
01436             : "m0" (fpVect[iDataLength - 1])
01437             : "mm0", "mm1", "mm2", "memory");
01438     }
01439     X86_ASM (
01440         "femms\n\t" \
01441         "sfence\n\t");
01442 }
01443 
01444 
01445 void dsp_x86_sse_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01446 {
01447     int iDataCntr;
01448 
01449     X86_ASM (
01450         "movss %0, %%xmm1\n\t" \
01451         "movss %1, %%xmm2\n\t"
01452         :
01453         : "m" (fAdd),
01454           "m" (fMul)
01455         : "xmm1", "xmm2", "memory");
01456     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01457     {
01458         X86_ASM (
01459             "movss %1, %%xmm0\n\t" \
01460             "addss %%xmm1, %%xmm0\n\t" \
01461             "mulss %%xmm2, %%xmm0\n\t" \
01462             "movss %%xmm0, %0\n\t"
01463             : "=m" (fpVect[iDataCntr])
01464             : "m0" (fpVect[iDataCntr])
01465             : "xmm0", "xmm1", "xmm2", "memory");
01466     }
01467 }
01468 
01469 
01470 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2, 
01471     int iDataLength)
01472 {
01473     int iDataCntr;
01474     int iDataCount;
01475     float fRes;
01476     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01477     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01478 
01479     iDataCount = (iDataLength >> 1);
01480     X86_ASM (
01481         "pxor %%mm0, %%mm0\n\t"
01482         :
01483         :
01484         : "mm0");
01485     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01486     {
01487         X86_ASM (
01488             "movq %0, %%mm1\n\t" \
01489             "movq %1, %%mm2\n\t" \
01490             "pfmul %%mm2, %%mm1\n\t" \
01491             "pfacc %%mm1, %%mm0\n\t"
01492             :
01493             : "m" (m64pSrc1[iDataCntr]),
01494               "m" (m64pSrc2[iDataCntr])
01495             : "mm0", "mm1", "mm2", "memory");
01496     }
01497     if (iDataLength & 0x1)
01498     {
01499         X86_ASM (
01500             "movd %0, %%mm1\n\t" \
01501             "movd %1, %%mm2\n\t" \
01502             "pfmul %%mm2, %%mm1\n\t" \
01503             "pfacc %%mm1, %%mm0\n\t"
01504             :
01505             : "m" (fpSrc1[iDataLength - 1]),
01506               "m" (fpSrc2[iDataLength - 1])
01507             : "mm0", "mm1", "mm2", "memory");
01508     }
01509     X86_ASM (
01510         "pfacc %%mm0, %%mm0\n\t" \
01511         "movd %%mm0, %0\n\t"
01512         : "=m" (fRes)
01513         :
01514         : "mm0", "memory");
01515     X86_ASM ("femms\n\t");
01516 
01517     return fRes;
01518 }
01519 
01520 
01521 float dsp_x86_sse_macf (const float *fpSrc1, const float *fpSrc2,
01522     int iDataLength)
01523 {
01524     int iDataCntr;
01525     float fRes;
01526     
01527     X86_ASM (
01528         "xorps %%xmm0, %%xmm0\n\t"
01529         :
01530         :
01531         : "xmm0");
01532     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01533     {
01534         X86_ASM (
01535             "movss %0, %%xmm1\n\t" \
01536             "mulss %1, %%xmm1\n\t" \
01537             "addss %%xmm1, %%xmm0\n\t"
01538             :
01539             : "m" (fpSrc1[iDataCntr]),
01540               "m" (fpSrc2[iDataCntr])
01541             : "xmm0", "xmm1", "xmm2", "memory");
01542     }
01543     X86_ASM (
01544         "movss %%xmm0, %0\n\t"
01545         : "=m" (fRes)
01546         :
01547         : "xmm0");
01548 
01549     return fRes;
01550 }
01551 
01552 
01553 double dsp_x86_sse_mac (const double *dpSrc1, const double *dpSrc2,
01554     int iDataLength)
01555 {
01556     int iDataCntr;
01557     double dRes;
01558     
01559     X86_ASM (
01560         "xorpd %%xmm0, %%xmm0\n\t"
01561         :
01562         :
01563         : "xmm0");
01564     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01565     {
01566         X86_ASM (
01567             "movsd %0, %%xmm1\n\t" \
01568             "mulsd %1, %%xmm1\n\t" \
01569             "addsd %%xmm1, %%xmm0\n\t"
01570             :
01571             : "m" (dpSrc1[iDataCntr]),
01572               "m" (dpSrc2[iDataCntr])
01573             : "xmm0", "xmm1", "xmm2", "memory");
01574     }
01575     X86_ASM (
01576         "movsd %%xmm0, %0\n\t"
01577         : "=m" (dRes)
01578         :
01579         : "xmm0");
01580 
01581     return dRes;
01582 }
01583 
01584 
01585 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc, 
01586     int iDataLength)
01587 {
01588     int iDataCntr;
01589     int iDataCount;
01590     stm64 m64Min;
01591     stm64 m64Max;
01592     stpm64 m64pSrc = (stpm64) fpSrc;
01593     
01594     m64Min.f[0] = m64Min.f[1] = FLT_MAX;
01595     m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
01596     iDataCount = (iDataLength >> 1);
01597     X86_ASM (
01598         "movq %0, %%mm1\n\t" \
01599         "movq %1, %%mm2\n\t"
01600         :
01601         : "m" (m64Min),
01602           "m" (m64Max)
01603         : "mm1", "mm2", "memory");
01604     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01605     {
01606         X86_ASM (
01607             "movq %0, %%mm0\n\t" \
01608             "pfmin %%mm0, %%mm1\n\t" \
01609             "pfmax %%mm0, %%mm2\n\t"
01610             :
01611             : "m" (m64pSrc[iDataCntr])
01612             : "mm0", "mm1", "mm2", "memory");
01613     }
01614     if (iDataLength & 0x1)
01615     {
01616         X86_ASM (
01617             "movd %0, %%mm0\n\t" \
01618             "pfmin %%mm0, %%mm1\n\t" \
01619             "pfmax %%mm0, %%mm2\n\t"
01620             :
01621             : "m" (fpSrc[iDataLength - 1])
01622             : "mm0", "mm1", "mm2", "memory");
01623     }
01624     X86_ASM (
01625         "pswapd %%mm1, %%mm3\n\t" \
01626         "pfmin %%mm3, %%mm1\n\t" \
01627         "pswapd %%mm2, %%mm3\n\t" \
01628         "pfmax %%mm3, %%mm2\n\t" \
01629         "movd %%mm1, %0\n\t" \
01630         "movd %%mm2, %1\n\t"
01631         : "=m" (*fpMin),
01632           "=m" (*fpMax)
01633         :
01634         : "mm1", "mm2", "mm3", "memory");
01635     X86_ASM ("femms\n\t");
01636 }
01637 
01638 
01639 void dsp_x86_sse_minmaxf (float *fpMin, float *fpMax, const float *fpSrc, 
01640     int iDataLength)
01641 {
01642     int iDataCntr;
01643 
01644     *fpMin = FLT_MAX;
01645     *fpMax = -FLT_MAX;
01646     X86_ASM (
01647         "movss %0, %%xmm0\n\t" \
01648         "movss %1, %%xmm1\n\t"
01649         :
01650         : "m" (*fpMin),
01651           "m" (*fpMax)
01652         : "xmm0", "xmm1", "memory");
01653     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01654     {
01655         X86_ASM (
01656             "movss %0, %%xmm2\n\t" \
01657             "minss %%xmm2, %%xmm0\n\t" \
01658             "maxss %%xmm2, %%xmm1\n\t"
01659             :
01660             : "m" (fpSrc[iDataCntr])
01661             : "xmm0", "xmm1", "xmm2", "memory");
01662     }
01663     X86_ASM (
01664         "movss %%xmm0, %0\n\t" \
01665         "movss %%xmm1, %1\n\t"
01666         : "=m" (*fpMin),
01667           "=m" (*fpMax)
01668         :
01669         : "xmm0", "xmm1", "memory");
01670 }
01671 
01672 
01673 void dsp_x86_sse_minmax (double *dpMin, double *dpMax, const double *dpSrc, 
01674     int iDataLength)
01675 {
01676     int iDataCntr;
01677 
01678     *dpMin = FLT_MAX;
01679     *dpMax = -FLT_MAX;
01680     X86_ASM (
01681         "movsd %0, %%xmm0\n\t" \
01682         "movsd %1, %%xmm1\n\t"
01683         :
01684         : "m" (*dpMin),
01685           "m" (*dpMax)
01686         : "xmm0", "xmm1", "memory");
01687     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01688     {
01689         X86_ASM (
01690             "movsd %0, %%xmm2\n\t" \
01691             "minsd %%xmm2, %%xmm0\n\t" \
01692             "maxsd %%xmm2, %%xmm1\n\t"
01693             :
01694             : "m" (dpSrc[iDataCntr])
01695             : "xmm0", "xmm1", "xmm2", "memory");
01696     }
01697     X86_ASM (
01698         "movss %%xmm0, %0\n\t" \
01699         "movss %%xmm1, %1\n\t"
01700         : "=m" (*dpMin),
01701           "=m" (*dpMax)
01702         :
01703         : "xmm0", "xmm1", "memory");
01704 }
01705 
01706 
01707 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01708     int iDataLength)
01709 {
01710     int iDataCntr;
01711     int iDataCount;
01712     float fRes;
01713     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01714     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01715     
01716     iDataCount = (iDataLength >> 1);
01717     X86_ASM (
01718         "pxor %%mm3, %%mm3\n\t" \
01719         "pxor %%mm4, %%mm4\n\t" \
01720         "pxor %%mm5, %%mm5\n\t"
01721         :
01722         :
01723         : "mm3", "mm4", "mm5");
01724     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01725     {
01726         X86_ASM (
01727             "movq %0, %%mm0\n\t" \
01728             "movq %1, %%mm1\n\t" \
01729             "movq %%mm1, %%mm2\n\t" \
01730             "pfmul %%mm0, %%mm2\n\t" \
01731             "pfacc %%mm2, %%mm5\n\t" \
01732             "pfmul %%mm0, %%mm0\n\t" \
01733             "pfacc %%mm0, %%mm3\n\t" \
01734             "pfmul %%mm1, %%mm1\n\t" \
01735             "pfacc %%mm1, %%mm4\n\t"
01736             :
01737             : "m" (m64pSrc1[iDataCntr]),
01738               "m" (m64pSrc2[iDataCntr])
01739             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01740     }
01741     if (iDataLength & 0x1)
01742     {
01743         X86_ASM (
01744             "movd %0, %%mm0\n\t" \
01745             "movd %1, %%mm1\n\t" \
01746             "movq %%mm1, %%mm2\n\t" \
01747             "pfmul %%mm0, %%mm2\n\t" \
01748             "pfacc %%mm2, %%mm5\n\t" \
01749             "pfmul %%mm0, %%mm0\n\t" \
01750             "pfacc %%mm0, %%mm3\n\t" \
01751             "pfmul %%mm1, %%mm1\n\t" \
01752             "pfacc %%mm1, %%mm4\n\t"
01753             :
01754             : "m" (fpSrc1[iDataLength - 1]),
01755               "m" (fpSrc2[iDataLength - 1])
01756             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01757     }
01758     X86_ASM (
01759         "pfacc %%mm3, %%mm3\n\t" \
01760         "pfacc %%mm4, %%mm4\n\t" \
01761         "pfacc %%mm5, %%mm5\n\t" \
01762         \
01763         "movd %1, %%mm6\n\t" \
01764         "pswapd %%mm6, %%mm7\n\t" \
01765         "paddd %%mm7, %%mm6\n\t" \
01766         "pi2fd %%mm6, %%mm7\n\t" \
01767         \
01768         "pfrcp %%mm7, %%mm6\n\t" \
01769         "pfrcpit1 %%mm6, %%mm7\n\t" \
01770         "pfrcpit2 %%mm6, %%mm7\n\t" \
01771         \
01772         "pfmul %%mm3, %%mm4\n\t" \
01773         \
01774         "movq %%mm4, %%mm0\n\t" \
01775         "pfrsqrt %%mm4, %%mm1\n\t" \
01776         "movq %%mm1, %%mm2\n\t" \
01777         "pfmul %%mm1, %%mm1\n\t" \
01778         "pfrsqit1 %%mm4, %%mm1\n\t" \
01779         "pfrcpit2 %%mm2, %%mm1\n\t" \
01780         "pfmul %%mm1, %%mm4\n\t" \
01781         \
01782         "pfmul %%mm6, %%mm4\n\t" \
01783         \
01784         "pfrcp %%mm4, %%mm0\n\t" \
01785         "pfrcpit1 %%mm0, %%mm4\n\t" \
01786         "pfrcpit2 %%mm0, %%mm4\n\t" \
01787         \
01788         "pfmul %%mm6, %%mm5\n\t" \
01789         "pfmul %%mm4, %%mm5\n\t" \
01790         "movd %%mm5, %0\n\t"
01791         : "=m" (fRes)
01792         : "m" (iDataLength)
01793         : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01794     X86_ASM ("femms\n\t");
01795 
01796     return fRes;
01797 }
01798 
01799 
01800 float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01801     int iDataLength)
01802 {
01803     int iDataCntr;
01804     float fScale;
01805     float fNormFact;
01806     float fProdSum;
01807     float fSqSum1;
01808     float fSqSum2;
01809     float fRes;
01810     
01811     X86_ASM (
01812         "xorps %%xmm0, %%xmm0\n\t" \
01813         "xorps %%xmm1, %%xmm1\n\t" \
01814         "xorps %%xmm2, %%xmm2\n\t"
01815         :
01816         :
01817         : "xmm0", "xmm1", "xmm2");
01818     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01819     {
01820         X86_ASM (
01821             "movss %3, %%xmm3\n\t" \
01822             "movss %4, %%xmm4\n\t" \
01823             \
01824             "movss %%xmm4, %%xmm5\n\t" \
01825             "mulss %%xmm3, %%xmm5\n\t" \
01826             "addss %%xmm5, %%xmm0\n\t" \
01827             \
01828             "movss %%xmm3, %%xmm5\n\t" \
01829             "mulss %%xmm3, %%xmm5\n\t" \
01830             "addss %%xmm5, %%xmm1\n\t" \
01831             \
01832             "movss %%xmm4, %%xmm5\n\t" \
01833             "mulss %%xmm4, %%xmm5\n\t" \
01834             "addss %%xmm5, %%xmm2\n\t" \
01835             \
01836             "movss %%xmm0, %0\n\t" \
01837             "movss %%xmm1, %1\n\t" \
01838             "movss %%xmm2, %2\n\t"
01839             : "=m" (fProdSum),
01840               "=m" (fSqSum1),
01841               "=m" (fSqSum2)
01842             : "m" (fpSrc1[iDataCntr]),
01843               "m" (fpSrc2[iDataCntr])
01844             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01845     }
01846     fScale = 1.0F / iDataLength;
01847     fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
01848     fRes = (fProdSum * fScale) / fNormFact;
01849 
01850     return fRes;
01851 }
01852 
01853 
01854 double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2,
01855     int iDataLength)
01856 {
01857     int iDataCntr;
01858     double dScale;
01859     double dNormFact;
01860     double dProdSum;
01861     double dSqSum1;
01862     double dSqSum2;
01863     double dRes;
01864     
01865     X86_ASM (
01866         "xorpd %%xmm0, %%xmm0\n\t" \
01867         "xorpd %%xmm1, %%xmm1\n\t" \
01868         "xorpd %%xmm2, %%xmm2\n\t"
01869         :
01870         :
01871         : "xmm0", "xmm1", "xmm2");
01872     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01873     {
01874         X86_ASM (
01875             "movsd %3, %%xmm3\n\t" \
01876             "movsd %4, %%xmm4\n\t" \
01877             \
01878             "movsd %%xmm4, %%xmm5\n\t" \
01879             "mulsd %%xmm3, %%xmm5\n\t" \
01880             "addsd %%xmm5, %%xmm0\n\t" \
01881             \
01882             "movsd %%xmm3, %%xmm5\n\t" \
01883             "mulsd %%xmm3, %%xmm5\n\t" \
01884             "addsd %%xmm5, %%xmm1\n\t" \
01885             \
01886             "movsd %%xmm4, %%xmm5\n\t" \
01887             "mulsd %%xmm4, %%xmm5\n\t" \
01888             "addsd %%xmm5, %%xmm2\n\t" \
01889             \
01890             "movsd %%xmm0, %0\n\t" \
01891             "movsd %%xmm1, %1\n\t" \
01892             "movsd %%xmm2, %2\n\t"
01893             : "=m" (dProdSum),
01894               "=m" (dSqSum1),
01895               "=m" (dSqSum2)
01896             : "m" (dpSrc1[iDataCntr]),
01897               "m" (dpSrc2[iDataCntr])
01898             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01899     }
01900     dScale = 1.0 / iDataLength;
01901     dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
01902     dRes = (dProdSum * dScale) / dNormFact;
01903 
01904     return dRes;
01905 }
01906 
01907 
/*
    Convert 16-bit signed integer samples to floats scaled by 1 / iIntMax
    (3DNow! version).

    fpDest      - destination, iDataLength floats
    ipSrc       - source, iDataLength shorts
    iDataLength - number of samples to convert
    iIntMax     - integer full-scale value; every sample is multiplied by
                  its reciprocal

    Samples are converted in pairs and written with non-temporal 8-byte
    stores (movntq); an unpaired last sample is converted in plain C.
*/
01908 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
01909     int iIntMax)
01910 {
01911     int iDataCntr;
01912     float fScale;
01913     
          /* Put iIntMax into both halves of mm1, convert to float and take the
             reciprocal (pfrcp plus the two refinement steps).  The result stays
             in mm1 for the loop below and is also copied to fScale for the
             scalar tail. */
01914     X86_ASM (
01915         "movd %1, %%mm1\n\t" \
01916         "pswapd %%mm1, %%mm2\n\t" \
01917         "paddd %%mm2, %%mm1\n\t" \
01918         "pi2fd %%mm1, %%mm1\n\t" \
01919         "pfrcp %%mm1, %%mm2\n\t" \
01920         "pfrcpit1 %%mm2, %%mm1\n\t" \
01921         "pfrcpit2 %%mm2, %%mm1\n\t" \
01922         "movd %%mm1, %0\n\t"
01923         : "=m" (fScale)
01924         : "m" (iIntMax)
01925         : "mm1", "mm2", "memory");
          /* Two samples per pass: load two shorts, spread each into its own
             dword, convert, scale, store 8 bytes.  The bound stops before an
             unpaired last sample so the load never reaches ipSrc[iDataLength]
             and the store never reaches fpDest[iDataLength]. */
01926     for (iDataCntr = 0; iDataCntr + 1 < iDataLength; iDataCntr += 2)
01927     {
01928         X86_ASM (
01929             "movd %1, %%mm0\n\t" \
01930             "punpcklwd %%mm0, %%mm0\n\t" \
01931             "pi2fw %%mm0, %%mm0\n\t" \
01932             "pfmul %%mm1, %%mm0\n\t" \
01933             "movntq %%mm0, %0\n\t"
01934             : "=m" (fpDest[iDataCntr])
01935             : "m" (ipSrc[iDataCntr])
01936             : "mm0", "mm1", "memory");
01937     }
          /* Leave MMX/3DNow! state before the float code below and fence the
             non-temporal stores. */
01938     X86_ASM (
01939         "femms\n\t" \
01940         "sfence\n\t");
          /* Odd length: the last sample was skipped by the paired loop. */
01941     if ((iDataLength % 2) != 0)
01942     {
01943         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01944     }
01945 }
01946 
01947 
/*
    Convert 32-bit signed integer samples to floats scaled by 1 / iIntMax
    (3DNow! version).

    fpDest      - destination, iDataLength floats
    ipSrc       - source, iDataLength ints
    iDataLength - number of samples to convert
    iIntMax     - integer full-scale value; every sample is multiplied by
                  its reciprocal

    Samples are converted in pairs and written with non-temporal 8-byte
    stores (movntq); an unpaired last sample is converted in plain C.
*/
01948 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
01949     int iIntMax)
01950 {
01951     int iDataCntr;
01952     float fScale;
01953     
          /* Put iIntMax into both halves of mm1, convert to float and take the
             reciprocal (pfrcp plus the two refinement steps).  The result stays
             in mm1 for the loop below and is also copied to fScale for the
             scalar tail. */
01954     X86_ASM (
01955         "movd %1, %%mm1\n\t" \
01956         "pswapd %%mm1, %%mm2\n\t" \
01957         "paddd %%mm2, %%mm1\n\t" \
01958         "pi2fd %%mm1, %%mm1\n\t" \
01959         "pfrcp %%mm1, %%mm2\n\t" \
01960         "pfrcpit1 %%mm2, %%mm1\n\t" \
01961         "pfrcpit2 %%mm2, %%mm1\n\t" \
01962         "movd %%mm1, %0\n\t"
01963         : "=m" (fScale)
01964         : "m" (iIntMax)
01965         : "mm1", "mm2", "memory");
          /* Two samples per pass: 8-byte load, convert both dwords, scale,
             8-byte store.  The bound stops before an unpaired last sample so
             neither the load nor the store reaches element iDataLength. */
01966     for (iDataCntr = 0; iDataCntr + 1 < iDataLength; iDataCntr += 2)
01967     {
01968         X86_ASM (
01969             "movq %1, %%mm0\n\t" \
01970             "pi2fd %%mm0, %%mm0\n\t" \
01971             "pfmul %%mm1, %%mm0\n\t" \
01972             "movntq %%mm0, %0\n\t"
01973             : "=m" (fpDest[iDataCntr])
01974             : "m" (ipSrc[iDataCntr])
01975             : "mm0", "mm1", "memory");
01976     }
          /* Leave MMX/3DNow! state before the float code below and fence the
             non-temporal stores. */
01977     X86_ASM (
01978         "femms\n\t" \
01979         "sfence\n\t");
          /* Odd length: the last sample was skipped by the paired loop. */
01980     if ((iDataLength % 2) != 0)
01981     {
01982         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01983     }
01984 }
01985 
01986 
/*
    Single-precision FIR filter (3DNow! version).

    For n = 0 .. iDataLength - 1:
        fpDest[n] = sum over k = 0 .. iCoeffLength - 1 of
                    fpSrc[iCoeffLength + n - k] * fpCoeff[k]

    fpDest       - output, iDataLength floats
    fpSrc        - input; elements 1 .. iDataLength + iCoeffLength - 1 are
                   read
    iDataLength  - number of output samples
    fpCoeff      - filter coefficients
    iCoeffLength - number of coefficients

    Outputs are produced in pairs and written with non-temporal 8-byte
    stores; an unpaired last output is computed separately.  The running
    sum is kept in mm0 across the separate asm statements.
*/
01987 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength, 
01988     const float *fpCoeff, int iCoeffLength)
01989 {
01990     int iSrcCntr;
01991     int iDestCntr;
01992     int iCoeffCntr;
01993     int iSrcCount;
01994     stpm64 m64pDest = (stpm64) fpDest;
01995 
01996     iDestCntr = 0;
01997     iSrcCount = iDataLength + iCoeffLength;
          /* Paired outputs.  The bound stops before an unpaired last output so
             the 8-byte load and store stay inside fpSrc and fpDest. */
01998     for (iSrcCntr = iCoeffLength; 
01999         iSrcCntr + 1 < iSrcCount; 
02000         iSrcCntr += 2)
02001     {
              /* Clear both accumulator lanes. */
02002         X86_ASM (
02003             "pxor %%mm0, %%mm0\n\t" 
02004             :
02005             :
02006             : "mm0");
02007         for (iCoeffCntr = 0; 
02008             iCoeffCntr < iCoeffLength;
02009             iCoeffCntr++)
02010         {
                  /* Load two adjacent source samples, copy the coefficient
                     into both lanes (movd + pswapd + pfadd), multiply and
                     accumulate. */
02011             X86_ASM (
02012                 "movq %0, %%mm1\n\t" \
02013                 "movd %1, %%mm2\n\t" \
02014                 "pswapd %%mm2, %%mm3\n\t" \
02015                 "pfadd %%mm3, %%mm2\n\t" \
02016                 "pfmul %%mm2, %%mm1\n\t" \
02017                 "pfadd %%mm1, %%mm0\n\t" 
02018                 :
02019                 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02020                   "m" (fpCoeff[iCoeffCntr])
02021                 : "mm0", "mm1", "mm2", "mm3", "memory");
02022         }
02023         X86_ASM (
02024             "movntq %%mm0, %0\n\t"
02025             : "=m" (m64pDest[iDestCntr++])
02026             :
02027             : "mm0", "memory");
02028     }
          /* Odd length: compute the last output on its own. */
02029     if (iDataLength & 0x1)
02030     {
02031         X86_ASM (
02032             "pxor %%mm0, %%mm0\n\t" 
02033             :
02034             :
02035             : "mm0");
02036         for (iCoeffCntr = 0; 
02037             iCoeffCntr < iCoeffLength;
02038             iCoeffCntr++)
02039         {
                  /* Output iDataLength - 1 uses the same source window as the
                     paired loop: fpSrc[iCoeffLength + n - k], which for the
                     last n is fpSrc[iSrcCount - 1 - k]. */
02040             X86_ASM (
02041                 "movd %0, %%mm1\n\t" \
02042                 "movd %1, %%mm2\n\t" \
02043                 "pfmul %%mm2, %%mm1\n\t" \
02044                 "pfadd %%mm1, %%mm0\n\t" 
02045                 :
02046                 : "m" (fpSrc[iSrcCount - 1 - iCoeffCntr]),
02047                   "m" (fpCoeff[iCoeffCntr])
02048                 : "mm0", "mm1", "mm2", "memory");
02049         }
02050         X86_ASM (
02051             "movd %%mm0, %0\n\t"
02052             : "=m" (fpDest[iDataLength - 1])
02053             :
02054             : "mm0", "memory");
02055     }
          /* Leave MMX/3DNow! state and fence the non-temporal stores. */
02056     X86_ASM (
02057         "femms\n\t" \
02058         "sfence\n\t");
02059 }
02060 
02061 
/*
    Single-precision FIR filter (SSE scalar instructions).

    For n = 0 .. iDataLength - 1:
        fpDest[n] = sum over k = 0 .. iCoeffLength - 1 of
                    fpSrc[iCoeffLength + n - k] * fpCoeff[k]

    fpDest       - output, iDataLength floats
    fpSrc        - input; elements 1 .. iDataLength + iCoeffLength - 1 are
                   read
    iDataLength  - number of output samples
    fpCoeff      - filter coefficients
    iCoeffLength - number of coefficients

    The running sum is kept in xmm0 across the separate asm statements.
*/
02062 void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength, 
02063     const float *fpCoeff, int iCoeffLength)
02064 {
02065     int iDestCntr;
02066     int iSrcCntr;
02067     int iCoeffCntr;
02068     int iSrcCount;
02069 
02070     iDestCntr = 0;
02071     iSrcCount = iDataLength + iCoeffLength;
          /* One output sample per pass; iSrcCntr is the newest source index
             used for that output. */
02072     for (iSrcCntr = iCoeffLength;
02073         iSrcCntr < iSrcCount;
02074         iSrcCntr++)
02075     {
              /* Clear the accumulator. */
02076         X86_ASM (
02077             "xorps %%xmm0, %%xmm0\n\t"
02078             :
02079             :
02080             : "xmm0");
02081         for (iCoeffCntr = 0;
02082             iCoeffCntr < iCoeffLength;
02083             iCoeffCntr++)
02084         {
                  /* xmm0 += source sample * coefficient. */
02085             X86_ASM (
02086                 "movss %0, %%xmm1\n\t"
02087                 "mulss %1, %%xmm1\n\t"
02088                 "addss %%xmm1, %%xmm0\n\t"
02089                 :
02090                 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02091                   "m" (fpCoeff[iCoeffCntr])
02092                 : "xmm0", "xmm1", "memory");
02093         }
              /* Store the finished sum and advance the output index. */
02094         X86_ASM (
02095             "movss %%xmm0, %0\n\t"
02096             : "=m" (fpDest[iDestCntr++])
02097             :
02098             : "xmm0", "memory");
02099     }
02100 }
02101 
02102 
/*
    Double-precision FIR filter (scalar movsd/mulsd/addsd instructions).

    For n = 0 .. iDataLength - 1:
        dpDest[n] = sum over k = 0 .. iCoeffLength - 1 of
                    dpSrc[iCoeffLength + n - k] * dpCoeff[k]

    dpDest       - output, iDataLength doubles
    dpSrc        - input; elements 1 .. iDataLength + iCoeffLength - 1 are
                   read
    iDataLength  - number of output samples
    dpCoeff      - filter coefficients
    iCoeffLength - number of coefficients

    The running sum is kept in xmm0 across the separate asm statements.
*/
02103 void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength, 
02104     const double *dpCoeff, int iCoeffLength)
02105 {
02106     int iDestCntr;
02107     int iSrcCntr;
02108     int iCoeffCntr;
02109     int iSrcCount;
02110 
02111     iDestCntr = 0;
02112     iSrcCount = iDataLength + iCoeffLength;
          /* One output sample per pass; iSrcCntr is the newest source index
             used for that output. */
02113     for (iSrcCntr = iCoeffLength;
02114         iSrcCntr < iSrcCount;
02115         iSrcCntr++)
02116     {
              /* Clear the accumulator. */
02117         X86_ASM (
02118             "xorpd %%xmm0, %%xmm0\n\t"
02119             :
02120             :
02121             : "xmm0");
02122         for (iCoeffCntr = 0;
02123             iCoeffCntr < iCoeffLength;
02124             iCoeffCntr++)
02125         {
                  /* xmm0 += source sample * coefficient. */
02126             X86_ASM (
02127                 "movsd %0, %%xmm1\n\t"
02128                 "mulsd %1, %%xmm1\n\t"
02129                 "addsd %%xmm1, %%xmm0\n\t"
02130                 :
02131                 : "m" (dpSrc[iSrcCntr - iCoeffCntr]),
02132                   "m" (dpCoeff[iCoeffCntr])
02133                 : "xmm0", "xmm1", "memory");
02134         }
              /* Store the finished sum and advance the output index. */
02135         X86_ASM (
02136             "movsd %%xmm0, %0\n\t"
02137             : "=m" (dpDest[iDestCntr++])
02138             :
02139             : "xmm0", "memory");
02140     }
02141 }
02142 
02143 
/*
    In-place single-precision IIR filter with two input and two output
    history taps (3DNow! version).

    For every sample x = fpVect[n] the value written back is
        y = fpCoeff[0] * x + fpCoeff[1] * x1 + fpCoeff[2] * x2
          + fpCoeff[3] * y1 + fpCoeff[4] * y2
    where x1/x2 are the previous/second previous inputs and y1/y2 the
    previous/second previous outputs.  All five terms are added.

    fpVect      - samples, filtered in place
    iDataLength - number of samples
    fpCoeff     - five coefficients
    fpX         - input history: on entry fpX[0] = x2, fpX[1] = x1; on exit
                  fpX[0]/fpX[1] hold the last two inputs (older first) and
                  fpX[2] a copy of the last input
    fpY         - output history: fpY[0] = y2, fpY[1] = y1 on entry and exit

    Coefficients and history stay in mm2-mm7 across the separate asm
    statements.
*/
02144 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff, 
02145     float *fpX, float *fpY)
02146 {
02147     int iDataCntr;
02148     stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02149     stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02150     stpm64 m64pX = (stpm64) fpX;
02151     stpm64 m64pY = (stpm64) fpY;
02152 
          /* Nothing to filter: leave the history untouched.  Otherwise the
             final store would write mm6, which only the loop loads, to
             fpX[2]. */
          if (iDataLength <= 0)
          {
              return;
          }

          /* mm2 = {c2, c1}, mm3 = {c0, 0}, mm4 = {c4, c3} (low lane first),
             mm5 = {fpX[0], fpX[1]}, mm7 = {fpY[0], fpY[1]}. */
02153     X86_ASM (
02154         "movq %0, %%mm0\n\t" \
02155         "pswapd %%mm0, %%mm2\n\t" \
02156         "movd %1, %%mm3\n\t" \
02157         "movq %2, %%mm0\n\t" \
02158         "pswapd %%mm0, %%mm4\n\t" \
02159         "movq %3, %%mm5\n\t" \
02160         "movq %4, %%mm7\n\t" \
02161         :
02162         : "m" (*m64pCoeff),
02163           "m" (fpCoeff[0]),
02164           "m" (*m64pCoeff2),
02165           "m" (*m64pX),
02166           "m" (*m64pY)
02167         : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02168     for (iDataCntr = 0; 
02169         iDataCntr < iDataLength; 
02170         iDataCntr++)
02171     {
              /* Load the sample into mm6, sum the three lane-wise products with
                 pfacc, then shift the new output into mm7 and the new input
                 into mm5 before storing the output. */
02172         X86_ASM (
02173             "pxor %%mm0, %%mm0\n\t" \
02174             "movd %1, %%mm6\n\t" \
02175             "movq %%mm5, %%mm1\n\t" \
02176             "pfmul %%mm2, %%mm1\n\t" \
02177             "pfacc %%mm1, %%mm0\n\t" \
02178             "movq %%mm6, %%mm1\n\t" \
02179             "pfmul %%mm3, %%mm1\n\t" \
02180             "pfacc %%mm1, %%mm0\n\t" \
02181             "movq %%mm7, %%mm1\n\t" \
02182             "pfmul %%mm4, %%mm1\n\t" \
02183             "pfacc %%mm1, %%mm0\n\t" \
02184             "pfacc %%mm0, %%mm0\n\t" \
02185             \
02186             "pswapd %%mm7, %%mm1\n\t" \
02187             "movq %%mm1, %%mm7\n\t" \
02188             "punpckldq %%mm0, %%mm7\n\t" \
02189             \
02190             "pswapd %%mm5, %%mm1\n\t" \
02191             "movq %%mm1, %%mm5\n\t" \
02192             "movq %%mm6, %%mm1\n\t" \
02193             "punpckldq %%mm1, %%mm5\n\t" \
02194             \
02195             "movd %%mm0, %0\n\t"
02196             : "=m" (fpVect[iDataCntr])
02197             : "m0" (fpVect[iDataCntr])
02198             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02199     }
          /* Write the history back for the next call. */
02200     X86_ASM (
02201         "movq %%mm5, %0\n\t" \
02202         "movd %%mm6, %1\n\t" \
02203         "movq %%mm7, %2\n\t"
02204         : "=m" (*m64pX),
02205           "=m" (fpX[2]),
02206           "=m" (*m64pY)
02207         :
02208         : "mm5", "mm6", "mm7", "memory");
02209     X86_ASM ("femms\n\t");
02210 }
02211 
02212 
/*
    In-place single-precision IIR filter with two input and two output
    history taps (SSE scalar instructions).

    For every sample x = fpVect[n] the value written back is
        y = fpCoeff[0] * x + fpCoeff[1] * x1 + fpCoeff[2] * x2
          + fpCoeff[3] * y1 + fpCoeff[4] * y2
    where x1/x2 are the previous/second previous inputs and y1/y2 the
    previous/second previous outputs.  All five terms are added.

    fpVect      - samples, filtered in place
    iDataLength - number of samples
    fpCoeff     - five coefficients
    fpX         - input history: on entry fpX[1] = x2, fpX[2] = x1; on exit
                  fpX[0..2] hold the last three inputs, oldest first
    fpY         - output history: fpY[0] = y2, fpY[1] = y1 on entry and exit

    The history stays in xmm0-xmm4 across the separate asm statements.
*/
02213 void dsp_x86_sse_iirf (float *fpVect, int iDataLength, const float *fpCoeff, 
02214     float *fpX, float *fpY)
02215 {
02216     int iDataCntr;
02217     
          /* Nothing to filter: leave the history untouched.  Otherwise the
             final store would write xmm0, which only the loop loads, to
             fpX[0]. */
          if (iDataLength <= 0)
          {
              return;
          }

          /* Load the history and prefetch the coefficients. */
02218     X86_ASM (
02219         "movss %0, %%xmm1\n\t" \
02220         "movss %1, %%xmm2\n\t" \
02221         "movss %2, %%xmm3\n\t" \
02222         "movss %3, %%xmm4\n\t" \
02223         "prefetchnta %4\n\t"
02224         :
02225         : "m" (fpX[1]),
02226           "m" (fpX[2]),
02227           "m" (fpY[0]),
02228           "m" (fpY[1]),
02229           "m" (fpCoeff[0])
02230         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02231     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02232     {
              /* Shift the input history (xmm0 <- xmm1 <- xmm2 <- sample), sum
                 the three input terms in xmm5 and the two output terms in
                 xmm6, then shift the output history (xmm3 <- xmm4 <- y). */
02233         X86_ASM (
02234             "movss %%xmm1, %%xmm0\n\t" \
02235             "movss %%xmm2, %%xmm1\n\t" \
02236             "movss %1, %%xmm2\n\t" \
02237             \
02238             "movss %2, %%xmm5\n\t" \
02239             "mulss %%xmm2, %%xmm5\n\t" \
02240             "movss %3, %%xmm6\n\t" \
02241             "mulss %%xmm1, %%xmm6\n\t" \
02242             "addss %%xmm6, %%xmm5\n\t" \
02243             "movss %4, %%xmm6\n\t" \
02244             "mulss %%xmm0, %%xmm6\n\t" \
02245             "addss %%xmm6, %%xmm5\n\t" \
02246             \
02247             "movss %5, %%xmm6\n\t" \
02248             "mulss %%xmm4, %%xmm6\n\t" \
02249             "movss %6, %%xmm7\n\t" \
02250             "mulss %%xmm3, %%xmm7\n\t" \
02251             "addss %%xmm7, %%xmm6\n\t" \
02252             \
02253             "addss %%xmm5, %%xmm6\n\t" \
02254             "movss %%xmm4, %%xmm3\n\t" \
02255             "movss %%xmm6, %%xmm4\n\t" \
02256             \
02257             "movss %%xmm6, %0\n\t"
02258             : "=m" (fpVect[iDataCntr])
02259             : "m0" (fpVect[iDataCntr]),
02260               "m" (fpCoeff[0]),
02261               "m" (fpCoeff[1]),
02262               "m" (fpCoeff[2]),
02263               "m" (fpCoeff[3]),
02264               "m" (fpCoeff[4])
02265             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02266               "memory");
02267     }
          /* Write the history back for the next call. */
02268     X86_ASM (
02269         "movss %%xmm0, %0\n\t" \
02270         "movss %%xmm1, %1\n\t" \
02271         "movss %%xmm2, %2\n\t" \
02272         "movss %%xmm3, %3\n\t" \
02273         "movss %%xmm4, %4\n\t"
02274         : "=m" (fpX[0]),
02275           "=m" (fpX[1]),
02276           "=m" (fpX[2]),
02277           "=m" (fpY[0]),
02278           "=m" (fpY[1])
02279         :
02280         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02281 }
02282 
02283 
/*
    In-place double-precision IIR filter with two input and two output
    history taps (scalar movsd/mulsd/addsd instructions).

    For every sample x = dpVect[n] the value written back is
        y = dpCoeff[0] * x + dpCoeff[1] * x1 + dpCoeff[2] * x2
          + dpCoeff[3] * y1 + dpCoeff[4] * y2
    where x1/x2 are the previous/second previous inputs and y1/y2 the
    previous/second previous outputs.  All five terms are added.

    dpVect      - samples, filtered in place
    iDataLength - number of samples
    dpCoeff     - five coefficients
    dpX         - input history: on entry dpX[1] = x2, dpX[2] = x1; on exit
                  dpX[0..2] hold the last three inputs, oldest first
    dpY         - output history: dpY[0] = y2, dpY[1] = y1 on entry and exit

    The history stays in xmm0-xmm4 across the separate asm statements.
*/
02284 void dsp_x86_sse_iir (double *dpVect, int iDataLength, const double *dpCoeff, 
02285     double *dpX, double *dpY)
02286 {
02287     int iDataCntr;
02288     
          /* Nothing to filter: leave the history untouched.  Otherwise the
             final store would write xmm0, which only the loop loads, to
             dpX[0]. */
          if (iDataLength <= 0)
          {
              return;
          }

          /* Load the history and prefetch the coefficients. */
02289     X86_ASM (
02290         "movsd %0, %%xmm1\n\t" \
02291         "movsd %1, %%xmm2\n\t" \
02292         "movsd %2, %%xmm3\n\t" \
02293         "movsd %3, %%xmm4\n\t" \
02294         "prefetchnta %4\n\t" \
02295         "prefetchnta %5\n\t"
02296         :
02297         : "m" (dpX[1]),
02298           "m" (dpX[2]),
02299           "m" (dpY[0]),
02300           "m" (dpY[1]),
02301           "m" (dpCoeff[0]),
02302           "m" (dpCoeff[3])
02303         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02304     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02305     {
              /* Shift the input history (xmm0 <- xmm1 <- xmm2 <- sample), sum
                 the three input terms in xmm5 and the two output terms in
                 xmm6, then shift the output history (xmm3 <- xmm4 <- y). */
02306         X86_ASM (
02307             "movsd %%xmm1, %%xmm0\n\t" \
02308             "movsd %%xmm2, %%xmm1\n\t" \
02309             "movsd %1, %%xmm2\n\t" \
02310             \
02311             "movsd %2, %%xmm5\n\t" \
02312             "mulsd %%xmm2, %%xmm5\n\t" \
02313             "movsd %3, %%xmm6\n\t" \
02314             "mulsd %%xmm1, %%xmm6\n\t" \
02315             "addsd %%xmm6, %%xmm5\n\t" \
02316             "movsd %4, %%xmm6\n\t" \
02317             "mulsd %%xmm0, %%xmm6\n\t" \
02318             "addsd %%xmm6, %%xmm5\n\t" \
02319             \
02320             "movsd %5, %%xmm6\n\t" \
02321             "mulsd %%xmm4, %%xmm6\n\t" \
02322             "movsd %6, %%xmm7\n\t" \
02323             "mulsd %%xmm3, %%xmm7\n\t" \
02324             "addsd %%xmm7, %%xmm6\n\t" \
02325             \
02326             "addsd %%xmm5, %%xmm6\n\t" \
02327             "movsd %%xmm4, %%xmm3\n\t" \
02328             "movsd %%xmm6, %%xmm4\n\t" \
02329             \
02330             "movsd %%xmm6, %0\n\t"
02331             : "=m" (dpVect[iDataCntr])
02332             : "m0" (dpVect[iDataCntr]),
02333               "m" (dpCoeff[0]),
02334               "m" (dpCoeff[1]),
02335               "m" (dpCoeff[2]),
02336               "m" (dpCoeff[3]),
02337               "m" (dpCoeff[4])
02338             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02339               "memory");
02340     }
          /* Write the history back for the next call. */
02341     X86_ASM (
02342         "movsd %%xmm0, %0\n\t" \
02343         "movsd %%xmm1, %1\n\t" \
02344         "movsd %%xmm2, %2\n\t" \
02345         "movsd %%xmm3, %3\n\t" \
02346         "movsd %%xmm4, %4\n\t"
02347         : "=m" (dpX[0]),
02348           "=m" (dpX[1]),
02349           "=m" (dpX[2]),
02350           "=m" (dpY[0]),
02351           "=m" (dpY[1])
02352         :
02353         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02354 }
02355 
02356 
/*
    Single-precision IIR filter with two input and two output history
    taps, reading from fpSrc and writing to fpDest (3DNow! version).

    For every sample x = fpSrc[n]:
        fpDest[n] = fpCoeff[0] * x + fpCoeff[1] * x1 + fpCoeff[2] * x2
                  + fpCoeff[3] * y1 + fpCoeff[4] * y2
    where x1/x2 are the previous/second previous inputs and y1/y2 the
    previous/second previous outputs.  All five terms are added.

    fpDest      - output, iDataLength floats
    fpSrc       - input, iDataLength floats
    iDataLength - number of samples
    fpCoeff     - five coefficients
    fpX         - input history: on entry fpX[0] = x2, fpX[1] = x1; on exit
                  fpX[0]/fpX[1] hold the last two inputs (older first) and
                  fpX[2] a copy of the last input
    fpY         - output history: fpY[0] = y2, fpY[1] = y1 on entry and exit

    Coefficients and history stay in mm2-mm7 across the separate asm
    statements.
*/
02357 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength, 
02358     const float *fpCoeff, float *fpX, float *fpY)
02359 {
02360     int iDataCntr;
02361     stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02362     stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02363     stpm64 m64pX = (stpm64) fpX;
02364     stpm64 m64pY = (stpm64) fpY;
02365 
          /* Nothing to filter: leave the history untouched.  Otherwise the
             final store would write mm6, which only the loop loads, to
             fpX[2]. */
          if (iDataLength <= 0)
          {
              return;
          }

          /* mm2 = {c2, c1}, mm3 = {c0, 0}, mm4 = {c4, c3} (low lane first),
             mm5 = {fpX[0], fpX[1]}, mm7 = {fpY[0], fpY[1]}. */
02366     X86_ASM (
02367         "movq %0, %%mm0\n\t" \
02368         "pswapd %%mm0, %%mm2\n\t" \
02369         "movd %1, %%mm3\n\t" \
02370         "movq %2, %%mm0\n\t" \
02371         "pswapd %%mm0, %%mm4\n\t" \
02372         "movq %3, %%mm5\n\t" \
02373         "movq %4, %%mm7\n\t" \
02374         :
02375         : "m" (*m64pCoeff),
02376           "m" (fpCoeff[0]),
02377           "m" (*m64pCoeff2),
02378           "m" (*m64pX),
02379           "m" (*m64pY)
02380         : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02381     for (iDataCntr = 0; 
02382         iDataCntr < iDataLength; 
02383         iDataCntr++)
02384     {
              /* Load the sample into mm6, sum the three lane-wise products with
                 pfacc, then shift the new output into mm7 and the new input
                 into mm5 before storing the output. */
02385         X86_ASM (
02386             "pxor %%mm0, %%mm0\n\t" \
02387             "movd %1, %%mm6\n\t" \
02388             "movq %%mm5, %%mm1\n\t" \
02389             "pfmul %%mm2, %%mm1\n\t" \
02390             "pfacc %%mm1, %%mm0\n\t" \
02391             "movq %%mm6, %%mm1\n\t" \
02392             "pfmul %%mm3, %%mm1\n\t" \
02393             "pfacc %%mm1, %%mm0\n\t" \
02394             "movq %%mm7, %%mm1\n\t" \
02395             "pfmul %%mm4, %%mm1\n\t" \
02396             "pfacc %%mm1, %%mm0\n\t" \
02397             "pfacc %%mm0, %%mm0\n\t" \
02398             \
02399             "pswapd %%mm7, %%mm1\n\t" \
02400             "movq %%mm1, %%mm7\n\t" \
02401             "punpckldq %%mm0, %%mm7\n\t" \
02402             \
02403             "pswapd %%mm5, %%mm1\n\t" \
02404             "movq %%mm1, %%mm5\n\t" \
02405             "movq %%mm6, %%mm1\n\t" \
02406             "punpckldq %%mm1, %%mm5\n\t" \
02407             \
02408             "movd %%mm0, %0\n\t"
02409             : "=m" (fpDest[iDataCntr])
02410             : "m" (fpSrc[iDataCntr])
02411             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02412     }
          /* Write the history back for the next call. */
02413     X86_ASM (
02414         "movq %%mm5, %0\n\t" \
02415         "movd %%mm6, %1\n\t" \
02416         "movq %%mm7, %2\n\t"
02417         : "=m" (*m64pX),
02418           "=m" (fpX[2]),
02419           "=m" (*m64pY)
02420         :
02421         : "mm5", "mm6", "mm7", "memory");
02422     X86_ASM ("femms\n\t");
02423 }
02424 
02425 
/*
    Single-precision IIR filter with two input and two output history
    taps, reading from fpSrc and writing to fpDest (SSE scalar
    instructions).

    For every sample x = fpSrc[n]:
        fpDest[n] = fpCoeff[0] * x + fpCoeff[1] * x1 + fpCoeff[2] * x2
                  + fpCoeff[3] * y1 + fpCoeff[4] * y2
    where x1/x2 are the previous/second previous inputs and y1/y2 the
    previous/second previous outputs.  All five terms are added.

    fpDest      - output, iDataLength floats
    fpSrc       - input, iDataLength floats
    iDataLength - number of samples
    fpCoeff     - five coefficients
    fpX         - input history: on entry fpX[1] = x2, fpX[2] = x1; on exit
                  fpX[0..2] hold the last three inputs, oldest first
    fpY         - output history: fpY[0] = y2, fpY[1] = y1 on entry and exit

    The history stays in xmm0-xmm4 across the separate asm statements.
*/
02426 void dsp_x86_sse_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength, 
02427     const float *fpCoeff, float *fpX, float *fpY)
02428 {
02429     int iDataCntr;
02430     
          /* Nothing to filter: leave the history untouched.  Otherwise the
             final store would write xmm0, which only the loop loads, to
             fpX[0]. */
          if (iDataLength <= 0)
          {
              return;
          }

          /* Load the history and prefetch the coefficients. */
02431     X86_ASM (
02432         "movss %0, %%xmm1\n\t" \
02433         "movss %1, %%xmm2\n\t" \
02434         "movss %2, %%xmm3\n\t" \
02435         "movss %3, %%xmm4\n\t" \
02436         "prefetchnta %4\n\t"
02437         :
02438         : "m" (fpX[1]),
02439           "m" (fpX[2]),
02440           "m" (fpY[0]),
02441           "m" (fpY[1]),
02442           "m" (fpCoeff[0])
02443         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02444     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02445     {
              /* Shift the input history (xmm0 <- xmm1 <- xmm2 <- sample), sum
                 the three input terms in xmm5 and the two output terms in
                 xmm6, then shift the output history (xmm3 <- xmm4 <- y). */
02446         X86_ASM (
02447             "movss %%xmm1, %%xmm0\n\t" \
02448             "movss %%xmm2, %%xmm1\n\t" \
02449             "movss %1, %%xmm2\n\t" \
02450             \
02451             "movss %2, %%xmm5\n\t" \
02452             "mulss %%xmm2, %%xmm5\n\t" \
02453             "movss %3, %%xmm6\n\t" \
02454             "mulss %%xmm1, %%xmm6\n\t" \
02455             "addss %%xmm6, %%xmm5\n\t" \
02456             "movss %4, %%xmm6\n\t" \
02457             "mulss %%xmm0, %%xmm6\n\t" \
02458             "addss %%xmm6, %%xmm5\n\t" \
02459             \
02460             "movss %5, %%xmm6\n\t" \
02461             "mulss %%xmm4, %%xmm6\n\t" \
02462             "movss %6, %%xmm7\n\t" \
02463             "mulss %%xmm3, %%xmm7\n\t" \
02464             "addss %%xmm7, %%xmm6\n\t" \
02465             \
02466             "addss %%xmm5, %%xmm6\n\t" \
02467             "movss %%xmm4, %%xmm3\n\t" \
02468             "movss %%xmm6, %%xmm4\n\t" \
02469             \
02470             "movss %%xmm6, %0\n\t"
02471             : "=m" (fpDest[iDataCntr])
02472             : "m" (fpSrc[iDataCntr]),
02473               "m" (fpCoeff[0]),
02474               "m" (fpCoeff[1]),
02475               "m" (fpCoeff[2]),
02476               "m" (fpCoeff[3]),
02477               "m" (fpCoeff[4])
02478             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02479               "memory");
02480     }
          /* Write the history back for the next call. */
02481     X86_ASM (
02482         "movss %%xmm0, %0\n\t" \
02483         "movss %%xmm1, %1\n\t" \
02484         "movss %%xmm2, %2\n\t" \
02485         "movss %%xmm3, %3\n\t" \
02486         "movss %%xmm4, %4\n\t"
02487         : "=m" (fpX[0]),
02488           "=m" (fpX[1]),
02489           "=m" (fpX[2]),
02490           "=m" (fpY[0]),
02491           "=m" (fpY[1])
02492         :
02493         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02494 }
02495 
02496 
/*
    Double-precision IIR filter with two input and two output history
    taps, reading from dpSrc and writing to dpDest (scalar
    movsd/mulsd/addsd instructions).

    For every sample x = dpSrc[n]:
        dpDest[n] = dpCoeff[0] * x + dpCoeff[1] * x1 + dpCoeff[2] * x2
                  + dpCoeff[3] * y1 + dpCoeff[4] * y2
    where x1/x2 are the previous/second previous inputs and y1/y2 the
    previous/second previous outputs.  All five terms are added.

    dpDest      - output, iDataLength doubles
    dpSrc       - input, iDataLength doubles
    iDataLength - number of samples
    dpCoeff     - five coefficients
    dpX         - input history: on entry dpX[1] = x2, dpX[2] = x1; on exit
                  dpX[0..2] hold the last three inputs, oldest first
    dpY         - output history: dpY[0] = y2, dpY[1] = y1 on entry and exit

    The history stays in xmm0-xmm4 across the separate asm statements.
*/
02497 void dsp_x86_sse_iir_nip (double *dpDest, const double *dpSrc, int iDataLength, 
02498     const double *dpCoeff, double *dpX, double *dpY)
02499 {
02500     int iDataCntr;
02501     
          /* Nothing to filter: leave the history untouched.  Otherwise the
             final store would write xmm0, which only the loop loads, to
             dpX[0]. */
          if (iDataLength <= 0)
          {
              return;
          }

          /* Load the history and prefetch the coefficients. */
02502     X86_ASM (
02503         "movsd %0, %%xmm1\n\t" \
02504         "movsd %1, %%xmm2\n\t" \
02505         "movsd %2, %%xmm3\n\t" \
02506         "movsd %3, %%xmm4\n\t" \
02507         "prefetchnta %4\n\t" \
02508         "prefetchnta %5\n\t"
02509         :
02510         : "m" (dpX[1]),
02511           "m" (dpX[2]),
02512           "m" (dpY[0]),
02513           "m" (dpY[1]),
02514           "m" (dpCoeff[0]),
02515           "m" (dpCoeff[3])
02516         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02517     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02518     {
              /* Shift the input history (xmm0 <- xmm1 <- xmm2 <- sample), sum
                 the three input terms in xmm5 and the two output terms in
                 xmm6, then shift the output history (xmm3 <- xmm4 <- y). */
02519         X86_ASM (
02520             "movsd %%xmm1, %%xmm0\n\t" \
02521             "movsd %%xmm2, %%xmm1\n\t" \
02522             "movsd %1, %%xmm2\n\t" \
02523             \
02524             "movsd %2, %%xmm5\n\t" \
02525             "mulsd %%xmm2, %%xmm5\n\t" \
02526             "movsd %3, %%xmm6\n\t" \
02527             "mulsd %%xmm1, %%xmm6\n\t" \
02528             "addsd %%xmm6, %%xmm5\n\t" \
02529             "movsd %4, %%xmm6\n\t" \
02530             "mulsd %%xmm0, %%xmm6\n\t" \
02531             "addsd %%xmm6, %%xmm5\n\t" \
02532             \
02533             "movsd %5, %%xmm6\n\t" \
02534             "mulsd %%xmm4, %%xmm6\n\t" \
02535             "movsd %6, %%xmm7\n\t" \
02536             "mulsd %%xmm3, %%xmm7\n\t" \
02537             "addsd %%xmm7, %%xmm6\n\t" \
02538             \
02539             "addsd %%xmm5, %%xmm6\n\t" \
02540             "movsd %%xmm4, %%xmm3\n\t" \
02541             "movsd %%xmm6, %%xmm4\n\t" \
02542             \
02543             "movsd %%xmm6, %0\n\t"
02544             : "=m" (dpDest[iDataCntr])
02545             : "m" (dpSrc[iDataCntr]),
02546               "m" (dpCoeff[0]),
02547               "m" (dpCoeff[1]),
02548               "m" (dpCoeff[2]),
02549               "m" (dpCoeff[3]),
02550               "m" (dpCoeff[4])
02551             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02552               "memory");
02553     }
          /* Write the history back for the next call. */
02554     X86_ASM (
02555         "movsd %%xmm0, %0\n\t" \
02556         "movsd %%xmm1, %1\n\t" \
02557         "movsd %%xmm2, %2\n\t" \
02558         "movsd %%xmm3, %3\n\t" \
02559         "movsd %%xmm4, %4\n\t"
02560         : "=m" (dpX[0]),
02561           "=m" (dpX[1]),
02562           "=m" (dpX[2]),
02563           "=m" (dpY[0]),
02564           "=m" (dpY[1])
02565         :
02566         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02567 }
02568 
02569 
02570 #ifdef __cplusplus
02571 }
02572 #endif
02573 
02574 #endif

Generated on Sun Nov 7 14:32:01 2004 for libDSP by doxygen 1.3.6