Main Page | Class Hierarchy | Alphabetical List | Compound List | File List | Compound Members | File Members

X86.c

Go to the documentation of this file.
00001 /*
00002 
00003     x86 specific optimized assembler dsp routines
00004     Copyright (C) 2001-2002 Jussi Laako
00005 
00006     This program is free software; you can redistribute it and/or modify
00007     it under the terms of the GNU General Public License as published by
00008     the Free Software Foundation; either version 2 of the License, or
00009     (at your option) any later version.
00010 
00011     This program is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014     GNU General Public License for more details.
00015 
00016     You should have received a copy of the GNU General Public License
00017     along with this program; if not, write to the Free Software
00018     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019 
00020 */
00021 
00022 
00023 #ifdef DSP_X86
00024 
00025 
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <limits.h>
00029 #include <math.h>
00030 #include <float.h>
00031 
00032 #include "dsp/X86.h"
00033 
00034 
/* Vendor string buffer: 12 bytes of CPUID vendor text followed by a
   terminating NUL.  Filled in and returned by dsp_x86_cpuid(). */
00035 static char cpCPUid[13];
00036 
00037 
00038 #ifdef __cplusplus
00039 extern "C"
00040 {
00041 #endif
00042 
00043 
00044 const char *dsp_x86_cpuid ()
00045 {
00046     unsigned int *ipCPUid = (unsigned int *) cpCPUid;
00047     
00048     X86_ASM (
00049         "pushl %%eax\n\t" \
00050         "pushl %%ebx\n\t" \
00051         "pushl %%ecx\n\t" \
00052         "pushl %%edx\n\t" \
00053         "xorl %%eax, %%eax\n\t" \
00054         "cpuid\n\t" \
00055         "movl %%ebx, %0\n\t" \
00056         "movl %%ecx, %2\n\t" \
00057         "movl %%edx, %1\n\t" \
00058         "popl %%edx\n\t" \
00059         "popl %%ecx\n\t" \
00060         "popl %%ebx\n\t" \
00061         "popl %%eax\n\t"
00062         : "=m" (ipCPUid[0]),
00063           "=m" (ipCPUid[1]),
00064           "=m" (ipCPUid[2])
00065         :
00066         : "eax", "ebx", "ecx", "edx", "memory");
00067     cpCPUid[12] = '\0';
00068 
00069     return cpCPUid;
00070 }
00071 
00072 
00073 unsigned int dsp_x86_features ()
00074 {
00075     unsigned int uiFeatures = 0;
00076     
00077     X86_ASM (
00078         "pushl %%eax\n\t" \
00079         "pushl %%ebx\n\t" \
00080         "pushl %%ecx\n\t" \
00081         "pushl %%edx\n\t" \
00082         "movl $1, %%eax\n\t" \
00083         "cpuid\n\t" \
00084         "movl %%edx, %0\n\t" \
00085         "popl %%edx\n\t" \
00086         "popl %%ecx\n\t" \
00087         "popl %%ebx\n\t" \
00088         "popl %%eax\n\t"
00089         : "=m" (uiFeatures)
00090         :
00091         : "eax", "ebx", "ecx", "edx", "memory");
00092     
00093     return uiFeatures;
00094 }
00095 
00096 
00097 unsigned int dsp_x86_amd_features ()
00098 {
00099     unsigned int uiFunction = 0x80000001;
00100     unsigned int uiFeatures = 0;
00101     
00102     X86_ASM (
00103         "pushl %%eax\n\t" \
00104         "pushl %%ebx\n\t" \
00105         "pushl %%ecx\n\t" \
00106         "pushl %%edx\n\t" \
00107         "movl %1, %%eax\n\t" \
00108         "cpuid\n\t" \
00109         "movl %%edx, %0\n\t" \
00110         "popl %%edx\n\t" \
00111         "popl %%ecx\n\t" \
00112         "popl %%ebx\n\t" \
00113         "popl %%eax\n\t"
00114         : "=m" (uiFeatures)
00115         : "m" (uiFunction)
00116         : "eax", "ebx", "ecx", "edx", "memory");
00117     
00118     return uiFeatures;
00119 }
00120 
00121 
/*
    Check for extended 3DNow! support.

    Returns 1 when the vendor string is "AuthenticAMD" and bits 30 and 31
    of the extended feature flags are both set, otherwise 0.
*/
extern int dsp_x86_have_e3dnow ()
{
    /* Unsigned constants: shifting a signed 1 into bit 31 overflows int. */
    const unsigned int uiRequired = (1u << 31) | (1u << 30);

    if (strcmp(dsp_x86_cpuid(), "AuthenticAMD") != 0)
        return 0;
    if ((dsp_x86_amd_features() & uiRequired) == uiRequired)
        return 1;
    return 0;
}
00134 
00135 
/*
    Check for SSE2 support.

    Returns 1 when bits 25 and 26 of the standard feature flags are both
    set, otherwise 0.
*/
extern int dsp_x86_have_sse2 ()
{
    const unsigned int uiRequired = (1 << 25) | (1 << 26);

    return ((dsp_x86_features() & uiRequired) == uiRequired) ? 1 : 0;
}
00145 
00146 
/*
    Copy iDataLength floats from fpSrc to fpDest through the MMX registers,
    using movntq (non-temporal) stores and prefetchnta hints on the source.

    The copy runs in three stages: blocks of 16 floats (eight 64-bit moves
    per iteration), then the remaining float pairs one 64-bit move at a
    time, and finally one 32-bit move when iDataLength is odd.
*/
00147 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
00148 {
00149     int iStartIdx;
00150     int iDataCntr;
00151     int iDataCount;
00152     stpm64 m64pDest = (stpm64) fpDest;
00153     stpm64 m64pSrc = (stpm64) fpSrc;
00154     
00155     iStartIdx = 0;
          /* Issue prefetch hints for source elements 0, 8, 16 and 24. */
00156     //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
00157     X86_ASM (
00158         "prefetchnta %0\n\t" \
00159         "prefetchnta %1\n\t" \
00160         "prefetchnta %2\n\t" \
00161         "prefetchnta %3\n\t"
00162         :
00163         : "m" (m64pSrc[0]),
00164           "m" (m64pSrc[8]),
00165           "m" (m64pSrc[16]),
00166           "m" (m64pSrc[24]));
00167     //#endif
          /* Number of 64-bit elements covered by whole 16-float blocks. */
00168     iDataCount = ((iDataLength & 0xfffffff0) >> 1);
00169     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00170     {
              /* Operands %0-%7 are the eight destination elements, %8-%15
                 the eight source elements and %16 the source element 32
                 positions ahead, which is only used as a prefetch hint. */
00171         //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
00172         X86_ASM (
00173             "prefetchnta %16\n\t" \
00174             "movq %8, %%mm0\n\t" \
00175             "movq %9, %%mm1\n\t" \
00176             "movq %10, %%mm2\n\t" \
00177             "movq %11, %%mm3\n\t" \
00178             "movq %12, %%mm4\n\t" \
00179             "movq %13, %%mm5\n\t" \
00180             "movq %14, %%mm6\n\t" \
00181             "movq %15, %%mm7\n\t" \
00182             "movntq %%mm0, %0\n\t" \
00183             "movntq %%mm1, %1\n\t" \
00184             "movntq %%mm2, %2\n\t" \
00185             "movntq %%mm3, %3\n\t" \
00186             "movntq %%mm4, %4\n\t" \
00187             "movntq %%mm5, %5\n\t" \
00188             "movntq %%mm6, %6\n\t" \
00189             "movntq %%mm7, %7\n\t"
00190             : "=m" (m64pDest[iDataCntr]),
00191               "=m" (m64pDest[iDataCntr + 1]),
00192               "=m" (m64pDest[iDataCntr + 2]),
00193               "=m" (m64pDest[iDataCntr + 3]),
00194               "=m" (m64pDest[iDataCntr + 4]),
00195               "=m" (m64pDest[iDataCntr + 5]),
00196               "=m" (m64pDest[iDataCntr + 6]),
00197               "=m" (m64pDest[iDataCntr + 7])
00198             : "m" (m64pSrc[iDataCntr]),
00199               "m" (m64pSrc[iDataCntr + 1]),
00200               "m" (m64pSrc[iDataCntr + 2]),
00201               "m" (m64pSrc[iDataCntr + 3]),
00202               "m" (m64pSrc[iDataCntr + 4]),
00203               "m" (m64pSrc[iDataCntr + 5]),
00204               "m" (m64pSrc[iDataCntr + 6]),
00205               "m" (m64pSrc[iDataCntr + 7]),
00206               "m" (m64pSrc[iDataCntr + 32])
00207             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00208         /* A disabled #else variant of the statement above, identical
                 except for the missing prefetchnta instruction, used to be
                 kept here as commented-out code. */
00245     }
          /* Continue where the block loop stopped and copy the remaining
             whole 64-bit elements (float pairs) one at a time. */
00246     iStartIdx = iDataCount;
00247     iDataCount = ((iDataLength & 0xfffffffe) >> 1);
00248     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00249     {
00250         //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
00251         X86_ASM (
00252             "prefetchnta %2\n\t" \
00253             "movq %1, %%mm0\n\t" \
00254             "movntq %%mm0, %0\n\t"
00255             : "=m" (m64pDest[iDataCntr])
00256             : "m" (m64pSrc[iDataCntr]),
00257               "m" (m64pSrc[iDataCntr + 32])
00258             : "mm0", "memory");
00259         /* A disabled #else variant without the prefetchnta instruction
                 used to be kept here as commented-out code. */
00267     }
          /* Odd length: the last float is copied with a 32-bit move. */
00268     if (iDataLength & 0x1)
00269     {
00270         X86_ASM (
00271             "movd %1, %%mm0\n\t" \
00272             "movd %%mm0, %0\n\t"
00273             : "=m" (fpDest[iDataLength - 1])
00274             : "m" (fpSrc[iDataLength - 1])
00275             : "mm0", "memory");
00276     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00277     X86_ASM (
00278         "femms\n\t" \
00279         "sfence\n\t");
00280 }
00281 
00282 
/*
    Copy iDataLength doubles from dpSrc to dpDest through the MMX registers,
    using movntq (non-temporal) stores and prefetchnta hints on the source.

    Blocks of eight doubles are copied per iteration first; the remaining
    doubles are then copied one at a time.
*/
00283 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
00284 {
00285     int iStartIdx;
00286     int iDataCntr;
00287     int iDataCount;
00288     
00289     iStartIdx = 0;
          /* Issue prefetch hints for source elements 0, 8, 16 and 24. */
00290     //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
00291     X86_ASM (
00292         "prefetchnta %0\n\t" \
00293         "prefetchnta %1\n\t" \
00294         "prefetchnta %2\n\t" \
00295         "prefetchnta %3\n\t"
00296         :
00297         : "m" (dpSrc[0]),
00298           "m" (dpSrc[8]),
00299           "m" (dpSrc[16]),
00300           "m" (dpSrc[24]));
00301     //#endif
          /* Number of doubles covered by whole blocks of eight. */
00302     iDataCount = (iDataLength & 0xfffffff8);
00303     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00304     {
              /* Operands %0-%7 are the eight destination elements, %8-%15
                 the eight source elements and %16 the source element 32
                 positions ahead, which is only used as a prefetch hint. */
00305         //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
00306         X86_ASM (
00307             "prefetchnta %16\n\t" \
00308             "movq %8, %%mm0\n\t" \
00309             "movq %9, %%mm1\n\t" \
00310             "movq %10, %%mm2\n\t" \
00311             "movq %11, %%mm3\n\t" \
00312             "movq %12, %%mm4\n\t" \
00313             "movq %13, %%mm5\n\t" \
00314             "movq %14, %%mm6\n\t" \
00315             "movq %15, %%mm7\n\t" \
00316             "movntq %%mm0, %0\n\t" \
00317             "movntq %%mm1, %1\n\t" \
00318             "movntq %%mm2, %2\n\t" \
00319             "movntq %%mm3, %3\n\t" \
00320             "movntq %%mm4, %4\n\t" \
00321             "movntq %%mm5, %5\n\t" \
00322             "movntq %%mm6, %6\n\t" \
00323             "movntq %%mm7, %7\n\t"
00324             : "=m" (dpDest[iDataCntr]),
00325               "=m" (dpDest[iDataCntr + 1]),
00326               "=m" (dpDest[iDataCntr + 2]),
00327               "=m" (dpDest[iDataCntr + 3]),
00328               "=m" (dpDest[iDataCntr + 4]),
00329               "=m" (dpDest[iDataCntr + 5]),
00330               "=m" (dpDest[iDataCntr + 6]),
00331               "=m" (dpDest[iDataCntr + 7])
00332             : "m" (dpSrc[iDataCntr]),
00333               "m" (dpSrc[iDataCntr + 1]),
00334               "m" (dpSrc[iDataCntr + 2]),
00335               "m" (dpSrc[iDataCntr + 3]),
00336               "m" (dpSrc[iDataCntr + 4]),
00337               "m" (dpSrc[iDataCntr + 5]),
00338               "m" (dpSrc[iDataCntr + 6]),
00339               "m" (dpSrc[iDataCntr + 7]),
00340               "m" (dpSrc[iDataCntr + 32])
00341             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00342         /* A disabled #else variant of the statement above, identical
                 except for the missing prefetchnta instruction, used to be
                 kept here as commented-out code. */
00379     }
          /* Continue where the block loop stopped and copy the remaining
             doubles one at a time. */
00380     iStartIdx = iDataCount;
00381     iDataCount = iDataLength;
00382     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00383     {
00384         //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
00385         X86_ASM (
00386             "prefetchnta %2\n\t" \
00387             "movq %1, %%mm0\n\t" \
00388             "movntq %%mm0, %0\n\t"
00389             : "=m" (dpDest[iDataCntr])
00390             : "m" (dpSrc[iDataCntr]),
00391               "m" (dpSrc[iDataCntr + 32])
00392             : "mm0", "memory");
00393         /* A disabled #else variant without the prefetchnta instruction
                 used to be kept here as commented-out code. */
00401     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00402     X86_ASM (
00403         "femms\n\t" \
00404         "sfence\n\t");
00405 }
00406 
00407 
00408 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
00409 {
00410     int iDataCntr;
00411     int iDataCount;
00412     stpm64 m64pVect = (stpm64) fpVect;
00413     stm64 m64Src;
00414 
00415     m64Src.f[0] = m64Src.f[1] = fSrc;
00416     iDataCount = (iDataLength >> 1);
00417     X86_ASM (
00418         "movq %0, %%mm1\n\t"
00419         :
00420         : "m" (m64Src)
00421         : "mm1", "memory");
00422     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00423     {
00424         X86_ASM (
00425             "movq %1, %%mm0\n\t" \
00426             "pfadd %%mm1, %%mm0\n\t" \
00427             "movntq %%mm0, %0\n\t"
00428             : "=m" (m64pVect[iDataCntr])
00429             : "0" (m64pVect[iDataCntr])
00430             : "mm0", "mm1", "memory");
00431     }
00432     if (iDataLength & 0x1)
00433     {
00434         X86_ASM (
00435             "movd %1, %%mm0\n\t" \
00436             "pfadd %%mm1, %%mm0\n\t" \
00437             "movd %%mm0, %0\n\t"
00438             : "=m" (fpVect[iDataLength - 1])
00439             : "0" (fpVect[iDataLength - 1])
00440             : "mm0", "mm1", "memory");
00441     }
00442     X86_ASM (
00443         "femms\n\t" \
00444         "sfence\n\t");
00445 }
00446 
00447 
00448 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
00449 {
00450     int iDataCntr;
00451     int iDataCount;
00452     stpm64 m64pVect = (stpm64) fpVect;
00453     stm64 m64Src;
00454 
00455     m64Src.f[0] = m64Src.f[1] = fSrc;
00456     iDataCount = (iDataLength >> 1);
00457     X86_ASM (
00458         "movq %0, %%mm1\n\t"
00459         :
00460         : "m" (m64Src)
00461         : "mm1", "memory");
00462     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00463     {
00464         X86_ASM (
00465             "movq %1, %%mm0\n\t" \
00466             "pfmul %%mm1, %%mm0\n\t" \
00467             "movntq %%mm0, %0\n\t"
00468             : "=m" (m64pVect[iDataCntr])
00469             : "0" (m64pVect[iDataCntr])
00470             : "mm0", "mm1", "memory");
00471     }
00472     if (iDataLength & 0x1)
00473     {
00474         X86_ASM (
00475             "movd %1, %%mm0\n\t" \
00476             "pfmul %%mm1, %%mm0\n\t" \
00477             "movd %%mm0, %0\n\t"
00478             : "=m" (fpVect[iDataLength - 1])
00479             : "0" (fpVect[iDataLength - 1])
00480             : "mm0", "mm1", "memory");
00481     }
00482     X86_ASM (
00483         "femms\n\t" \
00484         "sfence\n\t");
00485 }
00486 
00487 
/*
    Multiply each of the iDataLength floats in fpSrc1 by the scalar fSrc2
    and store the products in fpDest.

    Floats are processed two at a time as 64-bit elements; an odd trailing
    float is handled with 32-bit moves.
*/
00488 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2, 
00489     int iDataLength)
00490 {
00491     int iDataCntr;
00492     int iDataCount;
00493     stpm64 m64pDest = (stpm64) fpDest;
00494     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00495     stm64 m64Src2;
00496 
00497     m64Src2.f[0] = m64Src2.f[1] = fSrc2;
00498     iDataCount = (iDataLength >> 1);
          /* mm1 holds the scalar in both halves; it is loaded once here and
             read by the asm statements below. */
00499     X86_ASM (
00500         "movq %0, %%mm1\n\t"
00501         :
00502         : "m" (m64Src2)
00503         : "mm1", "memory");
00504     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00505     {
00506         X86_ASM (
00507             "movq %1, %%mm0\n\t" \
00508             "pfmul %%mm1, %%mm0\n\t" \
00509             "movntq %%mm0, %0\n\t"
00510             : "=m" (m64pDest[iDataCntr])
00511             : "m" (m64pSrc1[iDataCntr])
00512             : "mm0", "mm1", "memory");
00513     }
          /* Odd length: the last float is processed with 32-bit moves. */
00514     if (iDataLength & 0x1)
00515     {
00516         X86_ASM (
00517             "movd %1, %%mm0\n\t" \
00518             "pfmul %%mm1, %%mm0\n\t" \
00519             "movd %%mm0, %0\n\t"
00520             : "=m" (fpDest[iDataLength - 1])
00521             : "m" (fpSrc1[iDataLength - 1])
00522             : "mm0", "mm1", "memory");
00523     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00524     X86_ASM (
00525         "femms\n\t" \
00526         "sfence\n\t");
00527 }
00528 
00529 
00530 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00531 {
00532     int iDataCntr;
00533     int iDataCount;
00534     stpm64 m64pDest = (stpm64) fpDest;
00535     stpm64 m64pSrc = (stpm64) fpSrc;
00536 
00537     iDataCount = (iDataLength >> 1);
00538     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00539     {
00540         X86_ASM (
00541             "movq %1, %%mm0\n\t" \
00542             "movq %2, %%mm1\n\t" \
00543             "pfadd %%mm1, %%mm0\n\t" \
00544             "movntq %%mm0, %0\n\t"
00545             : "=m" (m64pDest[iDataCntr])
00546             : "0" (m64pDest[iDataCntr]),
00547               "m" (m64pSrc[iDataCntr])
00548             : "mm0", "mm1", "memory");
00549     }
00550     if (iDataLength & 0x1)
00551     {
00552         X86_ASM (
00553             "movd %1, %%mm0\n\t" \
00554             "movd %2, %%mm1\n\t" \
00555             "pfadd %%mm1, %%mm0\n\t" \
00556             "movd %%mm0, %0\n\t"
00557             : "=m" (fpDest[iDataLength - 1])
00558             : "0" (fpDest[iDataLength - 1]),
00559               "m" (fpSrc[iDataLength - 1])
00560             : "mm0", "mm1", "memory");
00561     }
00562     X86_ASM (
00563         "femms\n\t" \
00564         "sfence\n\t");
00565 }
00566 
00567 
00568 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00569 {
00570     int iDataCntr;
00571     int iDataCount;
00572     stpm64 m64pDest = (stpm64) fpDest;
00573     stpm64 m64pSrc = (stpm64) fpSrc;
00574 
00575     iDataCount = (iDataLength >> 1);
00576     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00577     {
00578         X86_ASM (
00579             "movq %1, %%mm0\n\t" \
00580             "movq %2, %%mm1\n\t" \
00581             "pfmul %%mm1, %%mm0\n\t" \
00582             "movntq %%mm0, %0\n\t"
00583             : "=m" (m64pDest[iDataCntr])
00584             : "0" (m64pDest[iDataCntr]),
00585               "m" (m64pSrc[iDataCntr])
00586             : "mm0", "mm1", "memory");
00587     }
00588     if (iDataLength & 0x1)
00589     {
00590         X86_ASM (
00591             "movd %1, %%mm0\n\t" \
00592             "movd %2, %%mm1\n\t" \
00593             "pfmul %%mm1, %%mm0\n\t" \
00594             "movd %%mm0, %0\n\t"
00595             : "=m" (fpDest[iDataLength - 1])
00596             : "0" (fpDest[iDataLength - 1]),
00597               "m" (fpSrc[iDataLength - 1])
00598             : "mm0", "mm1", "memory");
00599     }
00600     X86_ASM (
00601         "femms\n\t" \
00602         "sfence\n\t");
00603 }
00604 
00605 
/*
    Store the element-wise sum of the iDataLength floats in fpSrc1 and
    fpSrc2 in fpDest (fpDest[i] = fpSrc1[i] + fpSrc2[i]).

    Floats are processed two at a time as 64-bit elements; an odd trailing
    float is handled with 32-bit moves.
*/
00606 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1, 
00607     const float *fpSrc2, int iDataLength)
00608 {
00609     int iDataCntr;
00610     int iDataCount;
00611     stpm64 m64pDest = (stpm64) fpDest;
00612     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00613     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00614 
00615     iDataCount = (iDataLength >> 1);
00616     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00617     {
              /* %0 is the destination element, %1 and %2 the two sources. */
00618         X86_ASM (
00619             "movq %1, %%mm0\n\t" \
00620             "movq %2, %%mm1\n\t" \
00621             "pfadd %%mm1, %%mm0\n\t" \
00622             "movntq %%mm0, %0\n\t"
00623             : "=m" (m64pDest[iDataCntr])
00624             : "m" (m64pSrc1[iDataCntr]),
00625               "m" (m64pSrc2[iDataCntr])
00626             : "mm0", "mm1", "memory");
00627     }
          /* Odd length: the last float is processed with 32-bit moves. */
00628     if (iDataLength & 0x1)
00629     {
00630         X86_ASM (
00631             "movd %1, %%mm0\n\t" \
00632             "movd %2, %%mm1\n\t" \
00633             "pfadd %%mm1, %%mm0\n\t" \
00634             "movd %%mm0, %0\n\t"
00635             : "=m" (fpDest[iDataLength - 1])
00636             : "m" (fpSrc1[iDataLength - 1]),
00637               "m" (fpSrc2[iDataLength - 1])
00638             : "mm0", "mm1", "memory");
00639     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00640     X86_ASM (
00641         "femms\n\t" \
00642         "sfence\n\t");
00643 }
00644 
00645 
/*
    Store the element-wise product of the iDataLength floats in fpSrc1 and
    fpSrc2 in fpDest (fpDest[i] = fpSrc1[i] * fpSrc2[i]).

    Floats are processed two at a time as 64-bit elements; an odd trailing
    float is handled with 32-bit moves.
*/
00646 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1, 
00647     const float *fpSrc2, int iDataLength)
00648 {
00649     int iDataCntr;
00650     int iDataCount;
00651     stpm64 m64pDest = (stpm64) fpDest;
00652     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00653     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00654 
00655     iDataCount = (iDataLength >> 1);
00656     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00657     {
              /* %0 is the destination element, %1 and %2 the two sources. */
00658         X86_ASM (
00659             "movq %1, %%mm0\n\t" \
00660             "movq %2, %%mm1\n\t" \
00661             "pfmul %%mm1, %%mm0\n\t" \
00662             "movntq %%mm0, %0\n\t"
00663             : "=m" (m64pDest[iDataCntr])
00664             : "m" (m64pSrc1[iDataCntr]),
00665               "m" (m64pSrc2[iDataCntr])
00666             : "mm0", "mm1", "memory");
00667     }
          /* Odd length: the last float is processed with 32-bit moves. */
00668     if (iDataLength & 0x1)
00669     {
00670         X86_ASM (
00671             "movd %1, %%mm0\n\t" \
00672             "movd %2, %%mm1\n\t" \
00673             "pfmul %%mm1, %%mm0\n\t" \
00674             "movd %%mm0, %0\n\t"
00675             : "=m" (fpDest[iDataLength - 1])
00676             : "m" (fpSrc1[iDataLength - 1]),
00677               "m" (fpSrc2[iDataLength - 1])
00678             : "mm0", "mm1", "memory");
00679     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00680     X86_ASM (
00681         "femms\n\t" \
00682         "sfence\n\t");
00683 }
00684 
00685 
00686 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00687 {
00688     int iDataCntr;
00689     stpm64 m64pDest = (stpm64) fpDest;
00690     
00691     X86_ASM (
00692         "movq %0, %%mm3\n\t"
00693         :
00694         : "m" (fpSrc)
00695         : "mm3", "memory");
00696     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00697     {
00698         X86_ASM (
00699             "movq %1, %%mm0\n\t" \
00700             "movq %%mm3, %%mm1\n\t" \
00701             "pswapd %%mm1, %%mm2\n\t" \
00702             "pfmul %%mm0, %%mm1\n\t" \
00703             "pfmul %%mm0, %%mm2\n\t" \
00704             "pfpnacc %%mm2, %%mm1\n\t"
00705             "movntq %%mm1, %0\n\t"
00706             : "=m" (m64pDest[iDataCntr])
00707             : "0" (m64pDest[iDataCntr])
00708             : "mm0", "mm1", "mm2", "mm3", "memory");
00709     }
00710     X86_ASM (
00711         "femms\n\t" \
00712         "sfence\n\t");
00713 }
00714 
00715 
00716 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
00717 {
00718     int iDataCntr;
00719     stpm64 m64pDest = (stpm64) fpDest;
00720     stpm64 m64pSrc = (stpm64) fpSrc;
00721     
00722     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00723     {
00724         X86_ASM (
00725             "movq %1, %%mm0\n\t" \
00726             "movq %2, %%mm1\n\t" \
00727             "pswapd %%mm1, %%mm2\n\t" \
00728             "pfmul %%mm0, %%mm1\n\t" \
00729             "pfmul %%mm0, %%mm2\n\t" \
00730             "pfpnacc %%mm2, %%mm1\n\t"
00731             "movntq %%mm1, %0\n\t"
00732             : "=m" (m64pDest[iDataCntr])
00733             : "0" (m64pDest[iDataCntr]),
00734               "m" (m64pSrc[iDataCntr])
00735             : "mm0", "mm1", "mm2", "memory");
00736     }
00737     X86_ASM (
00738         "femms\n\t" \
00739         "sfence\n\t");
00740 }
00741 
00742 
/*
    Multiply the iDataLength 64-bit elements in fpSrc1 by the corresponding
    elements in fpSrc2 and store the results in fpDest.

    Each element is treated as a pair of floats (low, high).  For elements
    (a, b) from fpSrc1 and (c, d) from fpSrc2 the stored result is
    (a*c - b*d, a*d + b*c), i.e. a complex product with the real part in
    the low float.
*/
00743 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1, 
00744     const float *fpSrc2, int iDataLength)
00745 {
00746     int iDataCntr;
00747     stpm64 m64pDest = (stpm64) fpDest;
00748     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00749     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00750     
00751     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00752     {
              /* mm0 = (a, b), mm1 = (c, d), mm2 = (d, c) after pswapd.
                 After the two pfmul: mm1 = (a*c, b*d), mm2 = (a*d, b*c).
                 pfpnacc then leaves (a*c - b*d, a*d + b*c) in mm1. */
00753         X86_ASM (
00754             "movq %1, %%mm0\n\t" \
00755             "movq %2, %%mm1\n\t" \
00756             "pswapd %%mm1, %%mm2\n\t" \
00757             "pfmul %%mm0, %%mm1\n\t" \
00758             "pfmul %%mm0, %%mm2\n\t" \
00759             "pfpnacc %%mm2, %%mm1\n\t"
00760             "movntq %%mm1, %0\n\t"
00761             : "=m" (m64pDest[iDataCntr])
00762             : "m" (m64pSrc1[iDataCntr]),
00763               "m" (m64pSrc2[iDataCntr])
00764             : "mm0", "mm1", "mm2", "memory");
00765     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00766     X86_ASM (
00767         "femms\n\t" \
00768         "sfence\n\t");
00769 }
00770 
00771 
00772 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
00773 {
00774     int iDataCntr;
00775     int iDataCount;
00776     stpm64 m64pVect = (stpm64) fpVect;
00777     stm64 m64Mul;
00778     stm64 m64Add;
00779 
00780     m64Mul.f[0] = m64Mul.f[1] = fMul;
00781     m64Add.f[0] = m64Add.f[1] = fAdd;
00782     iDataCount = (iDataLength >> 1);
00783     X86_ASM (
00784         "movq %0, %%mm1\n\t" \
00785         "movq %1, %%mm2\n\t"
00786         :
00787         : "m" (m64Mul),
00788           "m" (m64Add)
00789         : "mm1", "mm2", "memory");
00790     /*X86_ASM (
00791         "movd %0, %%mm1\n\t" \
00792         "pswapd %%mm1, %%mm3\n\t" \
00793         "pfadd %%mm3, %%mm1\n\t" \
00794         "movd %1, %%mm2\n\t" \
00795         "pswapd %%mm2, %%mm3\n\t" \
00796         "pfadd %%mm3, %%mm2\n\t"
00797         :
00798         : "m" (fMul),
00799           "m" (fAdd)
00800         : "mm1", "mm2", "mm3", "memory");*/
00801     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00802     {
00803         X86_ASM (
00804             "movq %1, %%mm0\n\t" \
00805             "pfmul %%mm1, %%mm0\n\t" \
00806             "pfadd %%mm2, %%mm0\n\t" \
00807             "movntq %%mm0, %0\n\t"
00808             : "=m" (m64pVect[iDataCntr])
00809             : "0" (m64pVect[iDataCntr])
00810             : "mm0", "mm1", "mm2", "memory");
00811     }
00812     if (iDataLength & 0x1)
00813     {
00814         X86_ASM (
00815             "movd %1, %%mm0\n\t" \
00816             "pfmul %%mm1, %%mm0\n\t" \
00817             "pfadd %%mm2, %%mm0\n\t" \
00818             "movd %%mm0, %0\n\t"
00819             : "=m" (fpVect[iDataLength - 1])
00820             : "0" (fpVect[iDataLength - 1])
00821             : "mm0", "mm1", "mm2", "memory");
00822     }
00823     X86_ASM (
00824         "femms\n\t" \
00825         "sfence\n\t");
00826 }
00827 
00828 
/*
    Multiply-then-add: fpDest[i] = fpSrc[i] * fMul + fAdd for each of the
    iDataLength floats.

    Floats are processed two at a time as 64-bit elements; an odd trailing
    float is handled with 32-bit moves.
*/
00829 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
00830     float fMul, float fAdd, int iDataLength)
00831 {
00832     int iDataCntr;
00833     int iDataCount;
00834     stpm64 m64pDest = (stpm64) fpDest;
00835     stpm64 m64pSrc = (stpm64) fpSrc;
00836     stm64 m64Mul;
00837     stm64 m64Add;
00838 
00839     m64Mul.f[0] = m64Mul.f[1] = fMul;
00840     m64Add.f[0] = m64Add.f[1] = fAdd;
00841     iDataCount = (iDataLength >> 1);
          /* mm1 holds the multiplier and mm2 the addend, each in both
             halves; they are loaded once here and read by the asm
             statements below. */
00842     X86_ASM (
00843         "movq %0, %%mm1\n\t" \
00844         "movq %1, %%mm2\n\t"
00845         :
00846         : "m" (m64Mul),
00847           "m" (m64Add)
00848         : "mm1", "mm2", "memory");
00849     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00850     {
00851         X86_ASM (
00852             "movq %1, %%mm0\n\t" \
00853             "pfmul %%mm1, %%mm0\n\t" \
00854             "pfadd %%mm2, %%mm0\n\t" \
00855             "movntq %%mm0, %0\n\t"
00856             : "=m" (m64pDest[iDataCntr])
00857             : "m" (m64pSrc[iDataCntr])
00858             : "mm0", "mm1", "mm2", "memory");
00859     }
          /* Odd length: the last float is processed with 32-bit moves. */
00860     if (iDataLength & 0x1)
00861     {
00862         X86_ASM (
00863             "movd %1, %%mm0\n\t" \
00864             "pfmul %%mm1, %%mm0\n\t" \
00865             "pfadd %%mm2, %%mm0\n\t" \
00866             "movd %%mm0, %0\n\t"
00867             : "=m" (fpDest[iDataLength - 1])
00868             : "m" (fpSrc[iDataLength - 1])
00869             : "mm0", "mm1", "mm2", "memory");
00870     }
          /* femms leaves MMX state; sfence orders the movntq stores. */
00871     X86_ASM (
00872         "femms\n\t" \
00873         "sfence\n\t");
00874 }
00875 
00876 
00877 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
00878 {
00879     int iDataCntr;
00880     int iDataCount;
00881     stpm64 m64pVect = (stpm64) fpVect;
00882     stm64 m64Add;
00883     stm64 m64Mul;
00884 
00885     m64Add.f[0] = m64Add.f[1] = fAdd;
00886     m64Mul.f[0] = m64Mul.f[1] = fMul;
00887     iDataCount = (iDataLength >> 1);
00888     X86_ASM (
00889         "movq %0, %%mm1\n\t" \
00890         "movq %1, %%mm2\n\t"
00891         :
00892         : "m" (m64Add),
00893           "m" (m64Mul)
00894         : "mm1", "mm2", "memory");
00895     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00896     {
00897         X86_ASM (
00898             "movq %1, %%mm0\n\t" \
00899             "pfadd %%mm1, %%mm0\n\t" \
00900             "pfmul %%mm2, %%mm0\n\t" \
00901             "movntq %%mm0, %0\n\t"
00902             : "=m" (m64pVect[iDataCntr])
00903             : "0" (m64pVect[iDataCntr])
00904             : "mm0", "mm1", "mm2", "memory");
00905     }
00906     if (iDataLength & 0x1)
00907     {
00908         X86_ASM (
00909             "movd %1, %%mm0\n\t" \
00910             "pfadd %%mm1, %%mm0\n\t" \
00911             "pfmul %%mm2, %%mm0\n\t" \
00912             "movd %%mm0, %0\n\t"
00913             : "=m" (fpVect[iDataLength - 1])
00914             : "0" (fpVect[iDataLength - 1])
00915             : "mm0", "mm1", "mm2", "memory");
00916     }
00917     X86_ASM (
00918         "femms\n\t" \
00919         "sfence\n\t");
00920 }
00921 
00922 
00923 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2, 
00924     int iDataLength)
00925 {
00926     int iDataCntr;
00927     int iDataCount;
00928     float fRes;
00929     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00930     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00931 
00932     iDataCount = (iDataLength >> 1);
00933     X86_ASM (
00934         "pxor %%mm0, %%mm0\n\t"
00935         :
00936         :
00937         : "mm0");
00938     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00939     {
00940         X86_ASM (
00941             "movq %0, %%mm1\n\t" \
00942             "movq %1, %%mm2\n\t" \
00943             "pfmul %%mm2, %%mm1\n\t" \
00944             "pfacc %%mm1, %%mm0\n\t"
00945             :
00946             : "m" (m64pSrc1[iDataCntr]),
00947               "m" (m64pSrc2[iDataCntr])
00948             : "mm0", "mm1", "mm2", "memory");
00949     }
00950     if (iDataLength & 0x1)
00951     {
00952         X86_ASM (
00953             "movd %0, %%mm1\n\t" \
00954             "movd %1, %%mm2\n\t" \
00955             "pfmul %%mm2, %%mm1\n\t" \
00956             "pfacc %%mm1, %%mm0\n\t"
00957             :
00958             : "m" (fpSrc1[iDataLength - 1]),
00959               "m" (fpSrc2[iDataLength - 1])
00960             : "mm0", "mm1", "mm2", "memory");
00961     }
00962     X86_ASM (
00963         "pfacc %%mm0, %%mm0\n\t" \
00964         "movd %%mm0, %0\n\t"
00965         : "=m" (fRes)
00966         :
00967         : "mm0", "memory");
00968     X86_ASM ("femms\n\t");
00969 
00970     return fRes;
00971 }
00972 
00973 
00974 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc, 
00975     int iDataLength)
00976 {
00977     int iDataCntr;
00978     int iDataCount;
00979     stm64 m64Min;
00980     stm64 m64Max;
00981     stpm64 m64pSrc = (stpm64) fpSrc;
00982     
00983     m64Min.f[0] = m64Min.f[1] = FLT_MAX;
00984     m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
00985     iDataCount = (iDataLength >> 1);
00986     X86_ASM (
00987         "movq %0, %%mm1\n\t" \
00988         "movq %1, %%mm2\n\t"
00989         :
00990         : "m" (m64Min),
00991           "m" (m64Max)
00992         : "mm1", "mm2", "memory");
00993     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00994     {
00995         X86_ASM (
00996             "movq %0, %%mm0\n\t" \
00997             "pfmin %%mm0, %%mm1\n\t" \
00998             "pfmax %%mm0, %%mm2\n\t"
00999             :
01000             : "m" (m64pSrc[iDataCntr])
01001             : "mm0", "mm1", "mm2", "memory");
01002     }
01003     if (iDataLength & 0x1)
01004     {
01005         X86_ASM (
01006             "movd %0, %%mm0\n\t" \
01007             "pfmin %%mm0, %%mm1\n\t" \
01008             "pfmax %%mm0, %%mm2\n\t"
01009             :
01010             : "m" (fpSrc[iDataLength - 1])
01011             : "mm0", "mm1", "mm2", "memory");
01012     }
01013     X86_ASM (
01014         "pswapd %%mm1, %%mm3\n\t" \
01015         "pfmin %%mm3, %%mm1\n\t" \
01016         "pswapd %%mm2, %%mm3\n\t" \
01017         "pfmax %%mm3, %%mm2\n\t" \
01018         "movd %%mm1, %0\n\t" \
01019         "movd %%mm2, %1\n\t"
01020         : "=m" (*fpMin),
01021           "=m" (*fpMax)
01022         :
01023         : "mm1", "mm2", "mm3", "memory");
01024     X86_ASM ("femms\n\t");
01025 }
01026 
01027 
01028 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01029     int iDataLength)
01030 {
01031     int iDataCntr;
01032     int iDataCount;
01033     float fRes;
01034     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01035     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01036     
01037     iDataCount = (iDataLength >> 1);
01038     X86_ASM (
01039         "pxor %%mm3, %%mm3\n\t" \
01040         "pxor %%mm4, %%mm4\n\t" \
01041         "pxor %%mm5, %%mm5\n\t"
01042         :
01043         :
01044         : "mm3", "mm4", "mm5");
01045     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01046     {
01047         X86_ASM (
01048             "movq %0, %%mm0\n\t" \
01049             "movq %1, %%mm1\n\t" \
01050             "movq %%mm1, %%mm2\n\t" \
01051             "pfmul %%mm0, %%mm2\n\t" \
01052             "pfacc %%mm2, %%mm5\n\t" \
01053             "pfmul %%mm0, %%mm0\n\t" \
01054             "pfacc %%mm0, %%mm3\n\t" \
01055             "pfmul %%mm1, %%mm1\n\t" \
01056             "pfacc %%mm1, %%mm4\n\t"
01057             :
01058             : "m" (m64pSrc1[iDataCntr]),
01059               "m" (m64pSrc2[iDataCntr])
01060             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01061     }
01062     if (iDataLength & 0x1)
01063     {
01064         X86_ASM (
01065             "movd %0, %%mm0\n\t" \
01066             "movd %1, %%mm1\n\t" \
01067             "movq %%mm1, %%mm2\n\t" \
01068             "pfmul %%mm0, %%mm2\n\t" \
01069             "pfacc %%mm2, %%mm5\n\t" \
01070             "pfmul %%mm0, %%mm0\n\t" \
01071             "pfacc %%mm0, %%mm3\n\t" \
01072             "pfmul %%mm1, %%mm1\n\t" \
01073             "pfacc %%mm1, %%mm4\n\t"
01074             :
01075             : "m" (fpSrc1[iDataLength - 1]),
01076               "m" (fpSrc2[iDataLength - 1])
01077             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01078     }
01079     X86_ASM (
01080         "pfacc %%mm3, %%mm3\n\t" \
01081         "pfacc %%mm4, %%mm4\n\t" \
01082         "pfacc %%mm5, %%mm5\n\t" \
01083         \
01084         "movd %1, %%mm6\n\t" \
01085         "pswapd %%mm6, %%mm7\n\t" \
01086         "paddd %%mm7, %%mm6\n\t" \
01087         "pi2fd %%mm6, %%mm7\n\t" \
01088         \
01089         "pfrcp %%mm7, %%mm6\n\t" \
01090         "pfrcpit1 %%mm6, %%mm7\n\t" \
01091         "pfrcpit2 %%mm6, %%mm7\n\t" \
01092         \
01093         "pfmul %%mm3, %%mm4\n\t" \
01094         \
01095         "movq %%mm4, %%mm0\n\t" \
01096         "pfrsqrt %%mm4, %%mm1\n\t" \
01097         "movq %%mm1, %%mm2\n\t" \
01098         "pfmul %%mm1, %%mm1\n\t" \
01099         "pfrsqit1 %%mm4, %%mm1\n\t" \
01100         "pfrcpit2 %%mm2, %%mm1\n\t" \
01101         "pfmul %%mm1, %%mm4\n\t" \
01102         \
01103         "pfmul %%mm6, %%mm4\n\t" \
01104         \
01105         "pfrcp %%mm4, %%mm0\n\t" \
01106         "pfrcpit1 %%mm0, %%mm4\n\t" \
01107         "pfrcpit2 %%mm0, %%mm4\n\t" \
01108         \
01109         "pfmul %%mm6, %%mm5\n\t" \
01110         "pfmul %%mm4, %%mm5\n\t" \
01111         "movd %%mm5, %0\n\t"
01112         : "=m" (fRes)
01113         : "m" (iDataLength)
01114         : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01115     X86_ASM ("femms\n\t");
01116 
01117     return fRes;
01118 }
01119 
01120 
01121 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
01122     int iIntMax)
01123 {
01124     int iDataCntr;
01125     float fScale;
01126     
01127     X86_ASM (
01128         "movd %1, %%mm1\n\t" \
01129         "pswapd %%mm1, %%mm2\n\t" \
01130         "paddd %%mm2, %%mm1\n\t" \
01131         "pi2fd %%mm1, %%mm1\n\t" \
01132         "pfrcp %%mm1, %%mm2\n\t" \
01133         "pfrcpit1 %%mm2, %%mm1\n\t" \
01134         "pfrcpit2 %%mm2, %%mm1\n\t" \
01135         "movd %%mm1, %0\n\t"
01136         : "=m" (fScale)
01137         : "m" (iIntMax)
01138         : "mm1", "mm2", "memory");
01139     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01140     {
01141         X86_ASM (
01142             "movd %1, %%mm0\n\t" \
01143             "punpcklwd %%mm0, %%mm0\n\t" \
01144             "pi2fw %%mm0, %%mm0\n\t" \
01145             "pfmul %%mm1, %%mm0\n\t" \
01146             "movntq %%mm0, %0\n\t"
01147             : "=m" (fpDest[iDataCntr])
01148             : "m" (ipSrc[iDataCntr])
01149             : "mm0", "mm1", "memory");
01150     }
01151     X86_ASM (
01152         "femms\n\t" \
01153         "sfence\n\t");
01154     if ((iDataLength % 2) != 0)
01155     {
01156         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01157     }
01158 }
01159 
01160 
01161 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
01162     int iIntMax)
01163 {
01164     int iDataCntr;
01165     float fScale;
01166     
01167     X86_ASM (
01168         "movd %1, %%mm1\n\t" \
01169         "pswapd %%mm1, %%mm2\n\t" \
01170         "paddd %%mm2, %%mm1\n\t" \
01171         "pi2fd %%mm1, %%mm1\n\t" \
01172         "pfrcp %%mm1, %%mm2\n\t" \
01173         "pfrcpit1 %%mm2, %%mm1\n\t" \
01174         "pfrcpit2 %%mm2, %%mm1\n\t" \
01175         "movd %%mm1, %0\n\t"
01176         : "=m" (fScale)
01177         : "m" (iIntMax)
01178         : "mm1", "mm2", "memory");
01179     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01180     {
01181         X86_ASM (
01182             "movq %1, %%mm0\n\t" \
01183             "pi2fd %%mm0, %%mm0\n\t" \
01184             "pfmul %%mm1, %%mm0\n\t" \
01185             "movntq %%mm0, %0\n\t"
01186             : "=m" (fpDest[iDataCntr])
01187             : "m" (ipSrc[iDataCntr])
01188             : "mm0", "mm1", "memory");
01189     }
01190     X86_ASM (
01191         "femms\n\t" \
01192         "sfence\n\t");
01193     if ((iDataLength % 2) != 0)
01194     {
01195         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01196     }
01197 }
01198 
01199 
01200 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength, 
01201     const float *fpCoeff, int iCoeffLength)
01202 {
01203     int iSrcCntr;
01204     int iDestCntr;
01205     int iCoeffCntr;
01206     int iSrcCount;
01207     stpm64 m64pDest = (stpm64) fpDest;
01208 
01209     iDestCntr = 0;
01210     iSrcCount = iDataLength + iCoeffLength;
01211     for (iSrcCntr = iCoeffLength; 
01212         iSrcCntr < iSrcCount; 
01213         iSrcCntr += 2)
01214     {
01215         X86_ASM (
01216             "pxor %%mm0, %%mm0\n\t" 
01217             :
01218             :
01219             : "mm0");
01220         for (iCoeffCntr = 0; 
01221             iCoeffCntr < iCoeffLength;
01222             iCoeffCntr++)
01223         {
01224             X86_ASM (
01225                 "movq %0, %%mm1\n\t" \
01226                 "movd %1, %%mm2\n\t" \
01227                 "pswapd %%mm2, %%mm3\n\t" \
01228                 "pfadd %%mm3, %%mm2\n\t" \
01229                 "pfmul %%mm2, %%mm1\n\t" \
01230                 "pfadd %%mm1, %%mm0\n\t" 
01231                 :
01232                 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
01233                   "m" (fpCoeff[iCoeffCntr])
01234                 : "mm0", "mm1", "mm2", "mm3", "memory");
01235         }
01236         X86_ASM (
01237             "movntq %%mm0, %0\n\t"
01238             : "=m" (m64pDest[iDestCntr++])
01239             :
01240             : "mm0", "memory");
01241     }
01242     if (iDataLength & 0x1)
01243     {
01244         X86_ASM (
01245             "pxor %%mm0, %%mm0\n\t" 
01246             :
01247             :
01248             : "mm0");
01249         for (iCoeffCntr = 0; 
01250             iCoeffCntr < iCoeffLength;
01251             iCoeffCntr++)
01252         {
01253             X86_ASM (
01254                 "movd %0, %%mm1\n\t" \
01255                 "movd %1, %%mm2\n\t" \
01256                 "pfmul %%mm2, %%mm1\n\t" \
01257                 "pfadd %%mm1, %%mm0\n\t" 
01258                 :
01259                 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
01260                   "m" (fpCoeff[iCoeffCntr])
01261                 : "mm0", "mm1", "mm2", "memory");
01262         }
01263         X86_ASM (
01264             "movd %%mm0, %0\n\t"
01265             : "=m" (fpDest[iDataLength - 1])
01266             :
01267             : "mm0", "memory");
01268     }
01269     X86_ASM (
01270         "femms\n\t" \
01271         "sfence\n\t");
01272 }
01273 
01274 
01275 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff, 
01276     float *fpX, float *fpY)
01277 {
01278     int iDataCntr;
01279     stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
01280     stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
01281     stpm64 m64pX = (stpm64) fpX;
01282     stpm64 m64pY = (stpm64) fpY;
01283 
01284     X86_ASM (
01285         "movq %0, %%mm0\n\t" \
01286         "pswapd %%mm0, %%mm2\n\t" \
01287         "movd %1, %%mm3\n\t" \
01288         "movq %2, %%mm0\n\t" \
01289         "pswapd %%mm0, %%mm4\n\t" \
01290         "movq %3, %%mm5\n\t" \
01291         "movq %4, %%mm7\n\t" \
01292         :
01293         : "m" (*m64pCoeff),
01294           "m" (fpCoeff[0]),
01295           "m" (*m64pCoeff2),
01296           "m" (*m64pX),
01297           "m" (*m64pY)
01298         : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
01299     for (iDataCntr = 0; 
01300         iDataCntr < iDataLength; 
01301         iDataCntr++)
01302     {
01303         X86_ASM (
01304             "pxor %%mm0, %%mm0\n\t" \
01305             "movd %1, %%mm6\n\t" \
01306             "movq %%mm5, %%mm1\n\t" \
01307             "pfmul %%mm2, %%mm1\n\t" \
01308             "pfacc %%mm1, %%mm0\n\t" \
01309             "movq %%mm6, %%mm1\n\t" \
01310             "pfmul %%mm3, %%mm1\n\t" \
01311             "pfacc %%mm1, %%mm0\n\t" \
01312             "movq %%mm7, %%mm1\n\t" \
01313             "pfmul %%mm4, %%mm1\n\t" \
01314             "pfacc %%mm1, %%mm0\n\t" \
01315             "pfacc %%mm0, %%mm0\n\t" \
01316             \
01317             "pswapd %%mm7, %%mm1\n\t" \
01318             "movq %%mm1, %%mm7\n\t" \
01319             "punpckldq %%mm0, %%mm7\n\t" \
01320             \
01321             "pswapd %%mm5, %%mm1\n\t" \
01322             "movq %%mm1, %%mm5\n\t" \
01323             "movq %%mm6, %%mm1\n\t" \
01324             "punpckldq %%mm1, %%mm5\n\t" \
01325             \
01326             "movd %%mm0, %0\n\t"
01327             : "=m" (fpVect[iDataCntr])
01328             : "0" (fpVect[iDataCntr])
01329             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01330     }
01331     X86_ASM (
01332         "movq %%mm5, %0\n\t" \
01333         "movd %%mm6, %1\n\t" \
01334         "movq %%mm7, %2\n\t"
01335         : "=m" (*m64pX),
01336           "=m" (fpX[2]),
01337           "=m" (*m64pY)
01338         :
01339         : "mm5", "mm6", "mm7", "memory");
01340     X86_ASM ("femms\n\t");
01341 }
01342 
01343 
01344 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength, 
01345     const float *fpCoeff, float *fpX, float *fpY)
01346 {
01347     int iDataCntr;
01348     stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
01349     stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
01350     stpm64 m64pX = (stpm64) fpX;
01351     stpm64 m64pY = (stpm64) fpY;
01352 
01353     X86_ASM (
01354         "movq %0, %%mm0\n\t" \
01355         "pswapd %%mm0, %%mm2\n\t" \
01356         "movd %1, %%mm3\n\t" \
01357         "movq %2, %%mm0\n\t" \
01358         "pswapd %%mm0, %%mm4\n\t" \
01359         "movq %3, %%mm5\n\t" \
01360         "movq %4, %%mm7\n\t" \
01361         :
01362         : "m" (*m64pCoeff),
01363           "m" (fpCoeff[0]),
01364           "m" (*m64pCoeff2),
01365           "m" (*m64pX),
01366           "m" (*m64pY)
01367         : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
01368     for (iDataCntr = 0; 
01369         iDataCntr < iDataLength; 
01370         iDataCntr++)
01371     {
01372         X86_ASM (
01373             "pxor %%mm0, %%mm0\n\t" \
01374             "movd %1, %%mm6\n\t" \
01375             "movq %%mm5, %%mm1\n\t" \
01376             "pfmul %%mm2, %%mm1\n\t" \
01377             "pfacc %%mm1, %%mm0\n\t" \
01378             "movq %%mm6, %%mm1\n\t" \
01379             "pfmul %%mm3, %%mm1\n\t" \
01380             "pfacc %%mm1, %%mm0\n\t" \
01381             "movq %%mm7, %%mm1\n\t" \
01382             "pfmul %%mm4, %%mm1\n\t" \
01383             "pfacc %%mm1, %%mm0\n\t" \
01384             "pfacc %%mm0, %%mm0\n\t" \
01385             \
01386             "pswapd %%mm7, %%mm1\n\t" \
01387             "movq %%mm1, %%mm7\n\t" \
01388             "punpckldq %%mm0, %%mm7\n\t" \
01389             \
01390             "pswapd %%mm5, %%mm1\n\t" \
01391             "movq %%mm1, %%mm5\n\t" \
01392             "movq %%mm6, %%mm1\n\t" \
01393             "punpckldq %%mm1, %%mm5\n\t" \
01394             \
01395             "movd %%mm0, %0\n\t"
01396             : "=m" (fpDest[iDataCntr])
01397             : "m" (fpSrc[iDataCntr])
01398             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01399     }
01400     X86_ASM (
01401         "movq %%mm5, %0\n\t" \
01402         "movd %%mm6, %1\n\t" \
01403         "movq %%mm7, %2\n\t"
01404         : "=m" (*m64pX),
01405           "=m" (fpX[2]),
01406           "=m" (*m64pY)
01407         :
01408         : "mm5", "mm6", "mm7", "memory");
01409     X86_ASM ("femms\n\t");
01410 }
01411 
01412 
01413 #ifdef __cplusplus
01414 }
01415 #endif
01416 
01417 #endif

Generated on Sun Oct 26 00:09:01 2003 for libDSP by doxygen 1.3.3