00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifdef DSP_X86
00024
00025
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <limits.h>
00029 #include <math.h>
00030 #include <float.h>
00031
00032 #include "dsp/X86.h"
00033
00034
00035 #ifndef DSP_X86_64
00036 static char cpCPUid[13];
00037 #endif
00038
00039
00040 #ifdef __cplusplus
00041 extern "C"
00042 {
00043 #endif
00044
00045
00046 #ifndef DSP_X86_64
00047 const char *dsp_x86_cpuid ()
00048 {
00049 unsigned int *ipCPUid = (unsigned int *) cpCPUid;
00050
00051 X86_ASM (
00052 "pushl %%ebx\n\t" \
00053 "xorl %%eax, %%eax\n\t" \
00054 "cpuid\n\t" \
00055 "movl %%ebx, %0\n\t" \
00056 "movl %%ecx, %2\n\t" \
00057 "movl %%edx, %1\n\t" \
00058 "popl %%ebx\n\t" \
00059 : "=m" (ipCPUid[0]),
00060 "=m" (ipCPUid[1]),
00061 "=m" (ipCPUid[2])
00062 :
00063 : "eax", "ecx", "edx", "memory");
00064 cpCPUid[12] = '\0';
00065
00066 return cpCPUid;
00067 }
00068
00069
00070 unsigned int dsp_x86_features ()
00071 {
00072 unsigned int uiFeatures = 0;
00073
00074 X86_ASM (
00075 "pushl %%ebx\n\t" \
00076 "movl $1, %%eax\n\t" \
00077 "cpuid\n\t" \
00078 "movl %%edx, %0\n\t" \
00079 "popl %%ebx\n\t" \
00080 : "=m" (uiFeatures)
00081 :
00082 : "eax", "ecx", "edx", "memory");
00083
00084 return uiFeatures;
00085 }
00086
00087
00088 unsigned int dsp_x86_amd_features ()
00089 {
00090 unsigned int uiFunction = 0x80000001;
00091 unsigned int uiFeatures = 0;
00092
00093 X86_ASM (
00094 "pushl %%ebx\n\t" \
00095 "movl %1, %%eax\n\t" \
00096 "cpuid\n\t" \
00097 "movl %%edx, %0\n\t" \
00098 "popl %%ebx\n\t" \
00099 : "=m" (uiFeatures)
00100 : "m" (uiFunction)
00101 : "eax", "ecx", "edx", "memory");
00102
00103 return uiFeatures;
00104 }
00105 #endif
00106
00107
00108 extern int dsp_x86_have_e3dnow ()
00109 {
00110 #ifndef DSP_X86_64
00111 unsigned int uiExtSup = 0;
00112 unsigned int uiFeatures;
00113
00114 X86_ASM (
00115 "pushl %%ebx\n\t" \
00116 "movl $0x80000000, %%eax\n\t" \
00117 "cpuid\n\t" \
00118 "cmpl $0x80000001, %%eax\n\t" \
00119 "jl have3dnowxit\n\t" \
00120 "movl $1, %0\n\t" \
00121 "have3dnowxit:\n\t" \
00122 "popl %%ebx\n\t"
00123 : "=m" (uiExtSup)
00124 :
00125 : "eax", "ecx", "edx", "memory");
00126 if (uiExtSup)
00127 {
00128 uiFeatures = dsp_x86_amd_features();
00129 if ((uiFeatures & (1 << 31)) && (uiFeatures & (1 << 30)))
00130 return 1;
00131 }
00132 return 0;
00133 #else
00134 return 1;
00135 #endif
00136 }
00137
00138
/*
 * Report whether the CPU advertises the two CPUID leaf-1 EDX feature bits
 * 25 and 26 that this library requires for its SSE code paths.
 *
 * Builds with DSP_X86_64 defined answer 1 without querying the CPU.
 *
 * Returns 1 when both bits are set, 0 otherwise.
 */
extern int dsp_x86_have_sse2 ()
{
#ifdef DSP_X86_64
    return 1;
#else
    const unsigned int uiRequired = (1 << 25) | (1 << 26);
    const unsigned int uiReported = dsp_x86_features();

    return ((uiReported & uiRequired) == uiRequired) ? 1 : 0;
#endif
}
00152
00153
00154 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
00155 {
00156 int iStartIdx;
00157 int iDataCntr;
00158 int iDataCount;
00159 stpm64 m64pDest = (stpm64) fpDest;
00160 stpm64 m64pSrc = (stpm64) fpSrc;
00161
00162 iStartIdx = 0;
00163 X86_ASM (
00164 "prefetchnta %0\n\t" \
00165 "prefetchnta %1\n\t" \
00166 "prefetchnta %2\n\t" \
00167 "prefetchnta %3\n\t"
00168 :
00169 : "m" (m64pSrc[0]),
00170 "m" (m64pSrc[8]),
00171 "m" (m64pSrc[16]),
00172 "m" (m64pSrc[24]));
00173 iDataCount = ((iDataLength & 0xfffffff0) >> 1);
00174 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00175 {
00176 X86_ASM (
00177 "prefetchnta %16\n\t" \
00178 "movq %8, %%mm0\n\t" \
00179 "movq %9, %%mm1\n\t" \
00180 "movq %10, %%mm2\n\t" \
00181 "movq %11, %%mm3\n\t" \
00182 "movq %12, %%mm4\n\t" \
00183 "movq %13, %%mm5\n\t" \
00184 "movq %14, %%mm6\n\t" \
00185 "movq %15, %%mm7\n\t" \
00186 "movntq %%mm0, %0\n\t" \
00187 "movntq %%mm1, %1\n\t" \
00188 "movntq %%mm2, %2\n\t" \
00189 "movntq %%mm3, %3\n\t" \
00190 "movntq %%mm4, %4\n\t" \
00191 "movntq %%mm5, %5\n\t" \
00192 "movntq %%mm6, %6\n\t" \
00193 "movntq %%mm7, %7\n\t"
00194 : "=m" (m64pDest[iDataCntr]),
00195 "=m" (m64pDest[iDataCntr + 1]),
00196 "=m" (m64pDest[iDataCntr + 2]),
00197 "=m" (m64pDest[iDataCntr + 3]),
00198 "=m" (m64pDest[iDataCntr + 4]),
00199 "=m" (m64pDest[iDataCntr + 5]),
00200 "=m" (m64pDest[iDataCntr + 6]),
00201 "=m" (m64pDest[iDataCntr + 7])
00202 : "m" (m64pSrc[iDataCntr]),
00203 "m" (m64pSrc[iDataCntr + 1]),
00204 "m" (m64pSrc[iDataCntr + 2]),
00205 "m" (m64pSrc[iDataCntr + 3]),
00206 "m" (m64pSrc[iDataCntr + 4]),
00207 "m" (m64pSrc[iDataCntr + 5]),
00208 "m" (m64pSrc[iDataCntr + 6]),
00209 "m" (m64pSrc[iDataCntr + 7]),
00210 "m" (m64pSrc[iDataCntr + 32])
00211 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00212 }
00213 iStartIdx = iDataCount;
00214 iDataCount = ((iDataLength & 0xfffffffe) >> 1);
00215 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00216 {
00217 X86_ASM (
00218 "prefetchnta %2\n\t" \
00219 "movq %1, %%mm0\n\t" \
00220 "movntq %%mm0, %0\n\t"
00221 : "=m" (m64pDest[iDataCntr])
00222 : "m" (m64pSrc[iDataCntr]),
00223 "m" (m64pSrc[iDataCntr + 32])
00224 : "mm0", "memory");
00225 }
00226 if (iDataLength & 0x1)
00227 {
00228 X86_ASM (
00229 "movd %1, %%mm0\n\t" \
00230 "movd %%mm0, %0\n\t"
00231 : "=m" (fpDest[iDataLength - 1])
00232 : "m" (fpSrc[iDataLength - 1])
00233 : "mm0", "memory");
00234 }
00235 X86_ASM (
00236 "femms\n\t" \
00237 "sfence\n\t");
00238 }
00239
00240
00241 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
00242 {
00243 int iStartIdx;
00244 int iDataCntr;
00245 int iDataCount;
00246
00247 iStartIdx = 0;
00248 X86_ASM (
00249 "prefetchnta %0\n\t" \
00250 "prefetchnta %1\n\t" \
00251 "prefetchnta %2\n\t" \
00252 "prefetchnta %3\n\t"
00253 :
00254 : "m" (dpSrc[0]),
00255 "m" (dpSrc[8]),
00256 "m" (dpSrc[16]),
00257 "m" (dpSrc[24]));
00258 iDataCount = (iDataLength & 0xfffffff8);
00259 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00260 {
00261 X86_ASM (
00262 "prefetchnta %16\n\t" \
00263 "movq %8, %%mm0\n\t" \
00264 "movq %9, %%mm1\n\t" \
00265 "movq %10, %%mm2\n\t" \
00266 "movq %11, %%mm3\n\t" \
00267 "movq %12, %%mm4\n\t" \
00268 "movq %13, %%mm5\n\t" \
00269 "movq %14, %%mm6\n\t" \
00270 "movq %15, %%mm7\n\t" \
00271 "movntq %%mm0, %0\n\t" \
00272 "movntq %%mm1, %1\n\t" \
00273 "movntq %%mm2, %2\n\t" \
00274 "movntq %%mm3, %3\n\t" \
00275 "movntq %%mm4, %4\n\t" \
00276 "movntq %%mm5, %5\n\t" \
00277 "movntq %%mm6, %6\n\t" \
00278 "movntq %%mm7, %7\n\t"
00279 : "=m" (dpDest[iDataCntr]),
00280 "=m" (dpDest[iDataCntr + 1]),
00281 "=m" (dpDest[iDataCntr + 2]),
00282 "=m" (dpDest[iDataCntr + 3]),
00283 "=m" (dpDest[iDataCntr + 4]),
00284 "=m" (dpDest[iDataCntr + 5]),
00285 "=m" (dpDest[iDataCntr + 6]),
00286 "=m" (dpDest[iDataCntr + 7])
00287 : "m" (dpSrc[iDataCntr]),
00288 "m" (dpSrc[iDataCntr + 1]),
00289 "m" (dpSrc[iDataCntr + 2]),
00290 "m" (dpSrc[iDataCntr + 3]),
00291 "m" (dpSrc[iDataCntr + 4]),
00292 "m" (dpSrc[iDataCntr + 5]),
00293 "m" (dpSrc[iDataCntr + 6]),
00294 "m" (dpSrc[iDataCntr + 7]),
00295 "m" (dpSrc[iDataCntr + 32])
00296 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00297 }
00298 iStartIdx = iDataCount;
00299 iDataCount = iDataLength;
00300 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00301 {
00302 X86_ASM (
00303 "prefetchnta %2\n\t" \
00304 "movq %1, %%mm0\n\t" \
00305 "movntq %%mm0, %0\n\t"
00306 : "=m" (dpDest[iDataCntr])
00307 : "m" (dpSrc[iDataCntr]),
00308 "m" (dpSrc[iDataCntr + 32])
00309 : "mm0", "memory");
00310 }
00311 X86_ASM (
00312 "femms\n\t" \
00313 "sfence\n\t");
00314 }
00315
00316
00317 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
00318 {
00319 int iDataCntr;
00320 int iDataCount;
00321 stpm64 m64pVect = (stpm64) fpVect;
00322 stm64 m64Src;
00323
00324 m64Src.f[0] = m64Src.f[1] = fSrc;
00325 iDataCount = (iDataLength >> 1);
00326 X86_ASM (
00327 "movq %0, %%mm1\n\t"
00328 :
00329 : "m" (m64Src)
00330 : "mm1", "memory");
00331 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00332 {
00333 X86_ASM (
00334 "movq %1, %%mm0\n\t" \
00335 "pfadd %%mm1, %%mm0\n\t" \
00336 "movntq %%mm0, %0\n\t"
00337 : "=m" (m64pVect[iDataCntr])
00338 : "m0" (m64pVect[iDataCntr])
00339 : "mm0", "mm1", "memory");
00340 }
00341 if (iDataLength & 0x1)
00342 {
00343 X86_ASM (
00344 "movd %1, %%mm0\n\t" \
00345 "pfadd %%mm1, %%mm0\n\t" \
00346 "movd %%mm0, %0\n\t"
00347 : "=m" (fpVect[iDataLength - 1])
00348 : "m0" (fpVect[iDataLength - 1])
00349 : "mm0", "mm1", "memory");
00350 }
00351 X86_ASM (
00352 "femms\n\t" \
00353 "sfence\n\t");
00354 }
00355
00356
00357 void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
00358 {
00359 int iDataCntr;
00360
00361 X86_ASM (
00362 "movss %0, %%xmm1\n\t"
00363 :
00364 : "m" (fSrc)
00365 : "xmm1", "memory");
00366 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00367 {
00368 X86_ASM (
00369 "movss %1, %%xmm0\n\t" \
00370 "addss %%xmm1, %%xmm0\n\t" \
00371 "movss %%xmm0, %0\n\t"
00372 : "=m" (fpVect[iDataCntr])
00373 : "m0" (fpVect[iDataCntr])
00374 : "xmm0", "xmm1", "memory");
00375 }
00376 }
00377
00378
00379 void dsp_x86_sse_add (double *dpVect, double dSrc, int iDataLength)
00380 {
00381 int iDataCntr;
00382
00383 X86_ASM (
00384 "movsd %0, %%xmm1\n\t"
00385 :
00386 : "m" (dSrc)
00387 : "xmm1", "memory");
00388 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00389 {
00390 X86_ASM (
00391 "movsd %1, %%xmm0\n\t" \
00392 "addsd %%xmm1, %%xmm0\n\t" \
00393 "movsd %%xmm0, %0\n\t"
00394 : "=m" (dpVect[iDataCntr])
00395 : "m0" (dpVect[iDataCntr])
00396 : "xmm0", "xmm1", "memory");
00397 }
00398 }
00399
00400
00401 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
00402 {
00403 int iDataCntr;
00404 int iDataCount;
00405 stpm64 m64pVect = (stpm64) fpVect;
00406 stm64 m64Src;
00407
00408 m64Src.f[0] = m64Src.f[1] = fSrc;
00409 iDataCount = (iDataLength >> 1);
00410 X86_ASM (
00411 "movq %0, %%mm1\n\t"
00412 :
00413 : "m" (m64Src)
00414 : "mm1", "memory");
00415 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00416 {
00417 X86_ASM (
00418 "movq %1, %%mm0\n\t" \
00419 "pfmul %%mm1, %%mm0\n\t" \
00420 "movntq %%mm0, %0\n\t"
00421 : "=m" (m64pVect[iDataCntr])
00422 : "m0" (m64pVect[iDataCntr])
00423 : "mm0", "mm1", "memory");
00424 }
00425 if (iDataLength & 0x1)
00426 {
00427 X86_ASM (
00428 "movd %1, %%mm0\n\t" \
00429 "pfmul %%mm1, %%mm0\n\t" \
00430 "movd %%mm0, %0\n\t"
00431 : "=m" (fpVect[iDataLength - 1])
00432 : "m0" (fpVect[iDataLength - 1])
00433 : "mm0", "mm1", "memory");
00434 }
00435 X86_ASM (
00436 "femms\n\t" \
00437 "sfence\n\t");
00438 }
00439
00440
00441 void dsp_x86_sse_mulf (float *fpVect, float fSrc, int iDataLength)
00442 {
00443 int iDataCntr;
00444
00445 X86_ASM (
00446 "movss %0, %%xmm1\n\t"
00447 :
00448 : "m" (fSrc)
00449 : "xmm1", "memory");
00450 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00451 {
00452 X86_ASM (
00453 "movss %1, %%xmm0\n\t" \
00454 "mulss %%xmm1, %%xmm0\n\t" \
00455 "movss %%xmm0, %0\n\t"
00456 : "=m" (fpVect[iDataCntr])
00457 : "m0" (fpVect[iDataCntr])
00458 : "xmm0", "xmm1", "memory");
00459 }
00460 }
00461
00462
00463 void dsp_x86_sse_mul (double *dpVect, double dSrc, int iDataLength)
00464 {
00465 int iDataCntr;
00466
00467 X86_ASM (
00468 "movsd %0, %%xmm1\n\t"
00469 :
00470 : "m" (dSrc)
00471 : "xmm1", "memory");
00472 for (iDataCntr = 0; iDataCntr <iDataLength; iDataCntr++)
00473 {
00474 X86_ASM (
00475 "movsd %1, %%xmm0\n\t" \
00476 "mulsd %%xmm1, %%xmm0\n\t" \
00477 "movsd %%xmm0, %0\n\t"
00478 : "=m" (dpVect[iDataCntr])
00479 : "m0" (dpVect[iDataCntr])
00480 : "xmm0", "xmm1", "memory");
00481 }
00482 }
00483
00484
00485 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00486 int iDataLength)
00487 {
00488 int iDataCntr;
00489 int iDataCount;
00490 stpm64 m64pDest = (stpm64) fpDest;
00491 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00492 stm64 m64Src2;
00493
00494 m64Src2.f[0] = m64Src2.f[1] = fSrc2;
00495 iDataCount = (iDataLength >> 1);
00496 X86_ASM (
00497 "movq %0, %%mm1\n\t"
00498 :
00499 : "m" (m64Src2)
00500 : "mm1", "memory");
00501 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00502 {
00503 X86_ASM (
00504 "movq %1, %%mm0\n\t" \
00505 "pfmul %%mm1, %%mm0\n\t" \
00506 "movntq %%mm0, %0\n\t"
00507 : "=m" (m64pDest[iDataCntr])
00508 : "m" (m64pSrc1[iDataCntr])
00509 : "mm0", "mm1", "memory");
00510 }
00511 if (iDataLength & 0x1)
00512 {
00513 X86_ASM (
00514 "movd %1, %%mm0\n\t" \
00515 "pfmul %%mm1, %%mm0\n\t" \
00516 "movd %%mm0, %0\n\t"
00517 : "=m" (fpDest[iDataLength - 1])
00518 : "m" (fpSrc1[iDataLength - 1])
00519 : "mm0", "mm1", "memory");
00520 }
00521 X86_ASM (
00522 "femms\n\t" \
00523 "sfence\n\t");
00524 }
00525
00526
00527 void dsp_x86_sse_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00528 int iDataLength)
00529 {
00530 int iDataCntr;
00531
00532 X86_ASM (
00533 "movss %0, %%xmm1\n\t"
00534 :
00535 : "m" (fSrc2)
00536 : "xmm1", "memory");
00537 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00538 {
00539 X86_ASM (
00540 "movss %1, %%xmm0\n\t" \
00541 "mulss %%xmm1, %%xmm0\n\t" \
00542 "movss %%xmm0, %0\n\t"
00543 : "=m" (fpDest[iDataCntr])
00544 : "m" (fpSrc1[iDataCntr])
00545 : "xmm0", "xmm1", "memory");
00546 }
00547 }
00548
00549
00550 void dsp_x86_sse_mul_nip (double *dpDest, const double *dpSrc1, double dSrc2,
00551 int iDataLength)
00552 {
00553 int iDataCntr;
00554
00555 X86_ASM (
00556 "movsd %0, %%xmm1\n\t"
00557 :
00558 : "m" (dSrc2)
00559 : "xmm1", "memory");
00560 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00561 {
00562 X86_ASM (
00563 "movsd %1, %%xmm0\n\t" \
00564 "mulsd %%xmm1, %%xmm0\n\t" \
00565 "movsd %%xmm0, %0\n\t"
00566 : "=m" (dpDest[iDataCntr])
00567 : "m" (dpSrc1[iDataCntr])
00568 : "xmm0", "xmm1", "memory");
00569 }
00570 }
00571
00572
00573 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00574 {
00575 int iDataCntr;
00576 int iDataCount;
00577 stpm64 m64pDest = (stpm64) fpDest;
00578 stpm64 m64pSrc = (stpm64) fpSrc;
00579
00580 iDataCount = (iDataLength >> 1);
00581 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00582 {
00583 X86_ASM (
00584 "movq %1, %%mm0\n\t" \
00585 "movq %2, %%mm1\n\t" \
00586 "pfadd %%mm1, %%mm0\n\t" \
00587 "movntq %%mm0, %0\n\t"
00588 : "=m" (m64pDest[iDataCntr])
00589 : "m0" (m64pDest[iDataCntr]),
00590 "m" (m64pSrc[iDataCntr])
00591 : "mm0", "mm1", "memory");
00592 }
00593 if (iDataLength & 0x1)
00594 {
00595 X86_ASM (
00596 "movd %1, %%mm0\n\t" \
00597 "movd %2, %%mm1\n\t" \
00598 "pfadd %%mm1, %%mm0\n\t" \
00599 "movd %%mm0, %0\n\t"
00600 : "=m" (fpDest[iDataLength - 1])
00601 : "m0" (fpDest[iDataLength - 1]),
00602 "m" (fpSrc[iDataLength - 1])
00603 : "mm0", "mm1", "memory");
00604 }
00605 X86_ASM (
00606 "femms\n\t" \
00607 "sfence\n\t");
00608 }
00609
00610
00611 void dsp_x86_sse_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00612 {
00613 int iDataCntr;
00614
00615 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00616 {
00617 X86_ASM (
00618 "movss %1, %%xmm0\n\t" \
00619 "addss %2, %%xmm0\n\t" \
00620 "movss %%xmm0, %0\n\t"
00621 : "=m" (fpDest[iDataCntr])
00622 : "m0" (fpDest[iDataCntr]),
00623 "m" (fpSrc[iDataCntr])
00624 : "xmm0", "memory");
00625 }
00626 }
00627
00628
00629 void dsp_x86_sse_add2 (double *dpDest, const double *dpSrc, int iDataLength)
00630 {
00631 int iDataCntr;
00632
00633 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00634 {
00635 X86_ASM (
00636 "movsd %1, %%xmm0\n\t" \
00637 "addsd %2, %%xmm0\n\t" \
00638 "movsd %%xmm0, %0\n\t"
00639 : "=m" (dpDest[iDataCntr])
00640 : "m0" (dpDest[iDataCntr]),
00641 "m" (dpSrc[iDataCntr])
00642 : "xmm0", "memory");
00643 }
00644 }
00645
00646
00647 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00648 {
00649 int iDataCntr;
00650 int iDataCount;
00651 stpm64 m64pDest = (stpm64) fpDest;
00652 stpm64 m64pSrc = (stpm64) fpSrc;
00653
00654 iDataCount = (iDataLength >> 1);
00655 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00656 {
00657 X86_ASM (
00658 "movq %1, %%mm0\n\t" \
00659 "movq %2, %%mm1\n\t" \
00660 "pfmul %%mm1, %%mm0\n\t" \
00661 "movntq %%mm0, %0\n\t"
00662 : "=m" (m64pDest[iDataCntr])
00663 : "m0" (m64pDest[iDataCntr]),
00664 "m" (m64pSrc[iDataCntr])
00665 : "mm0", "mm1", "memory");
00666 }
00667 if (iDataLength & 0x1)
00668 {
00669 X86_ASM (
00670 "movd %1, %%mm0\n\t" \
00671 "movd %2, %%mm1\n\t" \
00672 "pfmul %%mm1, %%mm0\n\t" \
00673 "movd %%mm0, %0\n\t"
00674 : "=m" (fpDest[iDataLength - 1])
00675 : "m0" (fpDest[iDataLength - 1]),
00676 "m" (fpSrc[iDataLength - 1])
00677 : "mm0", "mm1", "memory");
00678 }
00679 X86_ASM (
00680 "femms\n\t" \
00681 "sfence\n\t");
00682 }
00683
00684
00685 void dsp_x86_sse_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00686 {
00687 int iDataCntr;
00688
00689 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00690 {
00691 X86_ASM (
00692 "movss %1, %%xmm0\n\t" \
00693 "mulss %2, %%xmm0\n\t" \
00694 "movss %%xmm0, %0\n\t"
00695 : "=m" (fpDest[iDataCntr])
00696 : "m0" (fpDest[iDataCntr]),
00697 "m" (fpSrc[iDataCntr])
00698 : "xmm0", "memory");
00699 }
00700 }
00701
00702
00703 void dsp_x86_sse_mul2 (double *dpDest, const double *dpSrc, int iDataLength)
00704 {
00705 int iDataCntr;
00706
00707 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00708 {
00709 X86_ASM (
00710 "movsd %1, %%xmm0\n\t" \
00711 "mulsd %2, %%xmm0\n\t" \
00712 "movsd %%xmm0, %0\n\t"
00713 : "=m" (dpDest[iDataCntr])
00714 : "m0" (dpDest[iDataCntr]),
00715 "m" (dpSrc[iDataCntr])
00716 : "xmm0", "memory");
00717 }
00718 }
00719
00720
00721 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1,
00722 const float *fpSrc2, int iDataLength)
00723 {
00724 int iDataCntr;
00725 int iDataCount;
00726 stpm64 m64pDest = (stpm64) fpDest;
00727 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00728 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00729
00730 iDataCount = (iDataLength >> 1);
00731 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00732 {
00733 X86_ASM (
00734 "movq %1, %%mm0\n\t" \
00735 "movq %2, %%mm1\n\t" \
00736 "pfadd %%mm1, %%mm0\n\t" \
00737 "movntq %%mm0, %0\n\t"
00738 : "=m" (m64pDest[iDataCntr])
00739 : "m" (m64pSrc1[iDataCntr]),
00740 "m" (m64pSrc2[iDataCntr])
00741 : "mm0", "mm1", "memory");
00742 }
00743 if (iDataLength & 0x1)
00744 {
00745 X86_ASM (
00746 "movd %1, %%mm0\n\t" \
00747 "movd %2, %%mm1\n\t" \
00748 "pfadd %%mm1, %%mm0\n\t" \
00749 "movd %%mm0, %0\n\t"
00750 : "=m" (fpDest[iDataLength - 1])
00751 : "m" (fpSrc1[iDataLength - 1]),
00752 "m" (fpSrc2[iDataLength - 1])
00753 : "mm0", "mm1", "memory");
00754 }
00755 X86_ASM (
00756 "femms\n\t" \
00757 "sfence\n\t");
00758 }
00759
00760
00761 void dsp_x86_sse_add3f (float *fpDest, const float *fpSrc1,
00762 const float *fpSrc2, int iDataLength)
00763 {
00764 int iDataCntr;
00765
00766 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00767 {
00768 X86_ASM (
00769 "movss %1, %%xmm0\n\t" \
00770 "addss %2, %%xmm0\n\t" \
00771 "movss %%xmm0, %0\n\t"
00772 : "=m" (fpDest[iDataCntr])
00773 : "m" (fpSrc1[iDataCntr]),
00774 "m" (fpSrc2[iDataCntr])
00775 : "xmm0", "memory");
00776 }
00777 }
00778
00779
00780 void dsp_x86_sse_add3 (double *dpDest, const double *dpSrc1,
00781 const double *dpSrc2, int iDataLength)
00782 {
00783 int iDataCntr;
00784
00785 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00786 {
00787 X86_ASM (
00788 "movsd %1, %%xmm0\n\t" \
00789 "addsd %2, %%xmm0\n\t" \
00790 "movsd %%xmm0, %0\n\t"
00791 : "=m" (dpDest[iDataCntr])
00792 : "m" (dpSrc1[iDataCntr]),
00793 "m" (dpSrc2[iDataCntr])
00794 : "xmm0", "memory");
00795 }
00796 }
00797
00798
00799 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1,
00800 const float *fpSrc2, int iDataLength)
00801 {
00802 int iDataCntr;
00803 int iDataCount;
00804 stpm64 m64pDest = (stpm64) fpDest;
00805 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00806 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00807
00808 iDataCount = (iDataLength >> 1);
00809 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00810 {
00811 X86_ASM (
00812 "movq %1, %%mm0\n\t" \
00813 "movq %2, %%mm1\n\t" \
00814 "pfmul %%mm1, %%mm0\n\t" \
00815 "movntq %%mm0, %0\n\t"
00816 : "=m" (m64pDest[iDataCntr])
00817 : "m" (m64pSrc1[iDataCntr]),
00818 "m" (m64pSrc2[iDataCntr])
00819 : "mm0", "mm1", "memory");
00820 }
00821 if (iDataLength & 0x1)
00822 {
00823 X86_ASM (
00824 "movd %1, %%mm0\n\t" \
00825 "movd %2, %%mm1\n\t" \
00826 "pfmul %%mm1, %%mm0\n\t" \
00827 "movd %%mm0, %0\n\t"
00828 : "=m" (fpDest[iDataLength - 1])
00829 : "m" (fpSrc1[iDataLength - 1]),
00830 "m" (fpSrc2[iDataLength - 1])
00831 : "mm0", "mm1", "memory");
00832 }
00833 X86_ASM (
00834 "femms\n\t" \
00835 "sfence\n\t");
00836 }
00837
00838
00839 void dsp_x86_sse_mul3f (float *fpDest, const float *fpSrc1,
00840 const float *fpSrc2, int iDataLength)
00841 {
00842 int iDataCntr;
00843
00844 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00845 {
00846 X86_ASM (
00847 "movss %1, %%xmm0\n\t" \
00848 "mulss %2, %%xmm0\n\t" \
00849 "movss %%xmm0, %0\n\t"
00850 : "=m" (fpDest[iDataCntr])
00851 : "m" (fpSrc1[iDataCntr]),
00852 "m" (fpSrc2[iDataCntr])
00853 : "xmm0", "memory");
00854 }
00855 }
00856
00857
00858 void dsp_x86_sse_mul3 (double *dpDest, const double *dpSrc1,
00859 const double *dpSrc2, int iDataLength)
00860 {
00861 int iDataCntr;
00862
00863 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00864 {
00865 X86_ASM (
00866 "movsd %1, %%xmm0\n\t" \
00867 "mulsd %2, %%xmm0\n\t" \
00868 "movsd %%xmm0, %0\n\t"
00869 : "=m" (dpDest[iDataCntr])
00870 : "m" (dpSrc1[iDataCntr]),
00871 "m" (dpSrc2[iDataCntr])
00872 : "xmm0", "memory");
00873 }
00874 }
00875
00876
00877 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00878 {
00879 int iDataCntr;
00880 stpm64 m64pDest = (stpm64) fpDest;
00881
00882 X86_ASM (
00883 "movq %0, %%mm3\n\t"
00884 :
00885 : "m" (fpSrc[0])
00886 : "mm3", "memory");
00887 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00888 {
00889 X86_ASM (
00890 "movq %1, %%mm0\n\t" \
00891 "movq %%mm3, %%mm1\n\t" \
00892 "pswapd %%mm1, %%mm2\n\t" \
00893 "pfmul %%mm0, %%mm1\n\t" \
00894 "pfmul %%mm0, %%mm2\n\t" \
00895 "pfpnacc %%mm2, %%mm1\n\t"
00896 "movntq %%mm1, %0\n\t"
00897 : "=m" (m64pDest[iDataCntr])
00898 : "m0" (m64pDest[iDataCntr])
00899 : "mm0", "mm1", "mm2", "mm3", "memory");
00900 }
00901 X86_ASM (
00902 "femms\n\t" \
00903 "sfence\n\t");
00904 }
00905
00906
00907 void dsp_x86_sse_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00908 {
00909 int iDataCntr;
00910 int iDataCount;
00911
00912 X86_ASM (
00913 "movss %0, %%xmm2\n\t" \
00914 "movss %1, %%xmm3\n\t"
00915 :
00916 : "m" (fpSrc[0]),
00917 "m" (fpSrc[1])
00918 : "xmm2", "xmm3", "memory");
00919 iDataCount = (iDataLength << 1);
00920 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00921 {
00922 X86_ASM (
00923 "movss %2, %%xmm0\n\t" \
00924 "movss %%xmm0, %%xmm1\n\t" \
00925 "movss %3, %%xmm4\n\t" \
00926 \
00927 "mulss %%xmm2, %%xmm0\n\t" \
00928 "movss %%xmm4, %%xmm5\n\t" \
00929 "mulss %%xmm3, %%xmm5\n\t" \
00930 "subss %%xmm0, %%xmm5\n\t" \
00931 \
00932 "mulss %%xmm3, %%xmm1\n\t" \
00933 "movss %%xmm4, %%xmm5\n\t" \
00934 "mulss %%xmm2, %%xmm5\n\t" \
00935 "addss %%xmm5, %%xmm1\n\t" \
00936 \
00937 "movss %%xmm0, %0\n\t" \
00938 "movss %%xmm1, %1\n\t"
00939 : "=m" (fpDest[iDataCntr]),
00940 "=m" (fpDest[iDataCntr + 1])
00941 : "m0" (fpDest[iDataCntr]),
00942 "m1" (fpDest[iDataCntr + 1])
00943 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00944 }
00945 }
00946
00947
00948 void dsp_x86_sse_cmul (double *dpDest, const double *dpSrc, int iDataLength)
00949 {
00950 int iDataCntr;
00951 int iDataCount;
00952
00953 X86_ASM (
00954 "movsd %0, %%xmm2\n\t" \
00955 "movsd %1, %%xmm3\n\t"
00956 :
00957 : "m" (dpSrc[0]),
00958 "m" (dpSrc[1])
00959 : "xmm2", "xmm3", "memory");
00960 iDataCount = (iDataLength << 1);
00961 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00962 {
00963 X86_ASM (
00964 "movsd %2, %%xmm0\n\t" \
00965 "movsd %%xmm0, %%xmm1\n\t" \
00966 "movsd %3, %%xmm4\n\t" \
00967 \
00968 "mulsd %%xmm2, %%xmm0\n\t" \
00969 "movsd %%xmm4, %%xmm5\n\t" \
00970 "mulsd %%xmm3, %%xmm5\n\t" \
00971 "subsd %%xmm0, %%xmm5\n\t" \
00972 \
00973 "mulsd %%xmm3, %%xmm1\n\t" \
00974 "movsd %%xmm4, %%xmm5\n\t" \
00975 "mulsd %%xmm2, %%xmm5\n\t" \
00976 "addsd %%xmm5, %%xmm1\n\t" \
00977 \
00978 "movsd %%xmm0, %0\n\t" \
00979 "movsd %%xmm1, %1\n\t"
00980 : "=m" (dpDest[iDataCntr]),
00981 "=m" (dpDest[iDataCntr + 1])
00982 : "m0" (dpDest[iDataCntr]),
00983 "m1" (dpDest[iDataCntr + 1])
00984 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00985 }
00986 }
00987
00988
00989 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
00990 {
00991 int iDataCntr;
00992 stpm64 m64pDest = (stpm64) fpDest;
00993 stpm64 m64pSrc = (stpm64) fpSrc;
00994
00995 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00996 {
00997 X86_ASM (
00998 "movq %1, %%mm0\n\t" \
00999 "movq %2, %%mm1\n\t" \
01000 "pswapd %%mm1, %%mm2\n\t" \
01001 "pfmul %%mm0, %%mm1\n\t" \
01002 "pfmul %%mm0, %%mm2\n\t" \
01003 "pfpnacc %%mm2, %%mm1\n\t"
01004 "movntq %%mm1, %0\n\t"
01005 : "=m" (m64pDest[iDataCntr])
01006 : "m0" (m64pDest[iDataCntr]),
01007 "m" (m64pSrc[iDataCntr])
01008 : "mm0", "mm1", "mm2", "memory");
01009 }
01010 X86_ASM (
01011 "femms\n\t" \
01012 "sfence\n\t");
01013 }
01014
01015
01016 void dsp_x86_sse_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
01017 {
01018 int iDataCntr;
01019 int iDataCount;
01020
01021 iDataCount = (iDataLength << 1);
01022 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01023 {
01024 X86_ASM (
01025 "movss %4, %%xmm2\n\t" \
01026 "movss %5, %%xmm3\n\t" \
01027 \
01028 "movss %2, %%xmm0\n\t" \
01029 "movss %%xmm0, %%xmm1\n\t" \
01030 "movss %3, %%xmm4\n\t" \
01031 \
01032 "mulss %%xmm2, %%xmm0\n\t" \
01033 "movss %%xmm4, %%xmm5\n\t" \
01034 "mulss %%xmm3, %%xmm5\n\t" \
01035 "subss %%xmm0, %%xmm5\n\t" \
01036 \
01037 "mulss %%xmm3, %%xmm1\n\t" \
01038 "movss %%xmm4, %%xmm5\n\t" \
01039 "mulss %%xmm2, %%xmm5\n\t" \
01040 "addss %%xmm5, %%xmm1\n\t" \
01041 \
01042 "movss %%xmm0, %0\n\t" \
01043 "movss %%xmm1, %1\n\t"
01044 : "=m" (fpDest[iDataCntr]),
01045 "=m" (fpDest[iDataCntr + 1])
01046 : "m0" (fpDest[iDataCntr]),
01047 "m1" (fpDest[iDataCntr + 1]),
01048 "m" (fpSrc[iDataCntr]),
01049 "m" (fpSrc[iDataCntr + 1])
01050 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01051 }
01052 }
01053
01054
01055 void dsp_x86_sse_cmul2 (double *dpDest, const double *dpSrc, int iDataLength)
01056 {
01057 int iDataCntr;
01058 int iDataCount;
01059
01060 iDataCount = (iDataLength << 1);
01061 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01062 {
01063 X86_ASM (
01064 "movsd %4, %%xmm2\n\t" \
01065 "movsd %5, %%xmm3\n\t" \
01066 \
01067 "movsd %2, %%xmm0\n\t" \
01068 "movsd %%xmm0, %%xmm1\n\t" \
01069 "movsd %3, %%xmm4\n\t" \
01070 \
01071 "mulsd %%xmm2, %%xmm0\n\t" \
01072 "movsd %%xmm4, %%xmm5\n\t" \
01073 "mulsd %%xmm3, %%xmm5\n\t" \
01074 "subsd %%xmm0, %%xmm5\n\t" \
01075 \
01076 "mulsd %%xmm3, %%xmm1\n\t" \
01077 "movsd %%xmm4, %%xmm5\n\t" \
01078 "mulsd %%xmm2, %%xmm5\n\t" \
01079 "addsd %%xmm5, %%xmm1\n\t" \
01080 \
01081 "movsd %%xmm0, %0\n\t" \
01082 "movsd %%xmm1, %1\n\t"
01083 : "=m" (dpDest[iDataCntr]),
01084 "=m" (dpDest[iDataCntr + 1])
01085 : "m0" (dpDest[iDataCntr]),
01086 "m1" (dpDest[iDataCntr + 1]),
01087 "m" (dpSrc[iDataCntr]),
01088 "m" (dpSrc[iDataCntr + 1])
01089 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01090 }
01091 }
01092
01093
01094 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1,
01095 const float *fpSrc2, int iDataLength)
01096 {
01097 int iDataCntr;
01098 stpm64 m64pDest = (stpm64) fpDest;
01099 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01100 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01101
01102 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01103 {
01104 X86_ASM (
01105 "movq %1, %%mm0\n\t" \
01106 "movq %2, %%mm1\n\t" \
01107 "pswapd %%mm1, %%mm2\n\t" \
01108 "pfmul %%mm0, %%mm1\n\t" \
01109 "pfmul %%mm0, %%mm2\n\t" \
01110 "pfpnacc %%mm2, %%mm1\n\t"
01111 "movntq %%mm1, %0\n\t"
01112 : "=m" (m64pDest[iDataCntr])
01113 : "m" (m64pSrc1[iDataCntr]),
01114 "m" (m64pSrc2[iDataCntr])
01115 : "mm0", "mm1", "mm2", "memory");
01116 }
01117 X86_ASM (
01118 "femms\n\t" \
01119 "sfence\n\t");
01120 }
01121
01122
01123 void dsp_x86_sse_cmul3f (float *fpDest, const float *fpSrc1,
01124 const float *fpSrc2, int iDataLength)
01125 {
01126 int iDataCntr;
01127 int iDataCount;
01128
01129 iDataCount = (iDataLength << 1);
01130 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01131 {
01132 X86_ASM (
01133 "movss %4, %%xmm2\n\t" \
01134 "movss %5, %%xmm3\n\t" \
01135 \
01136 "movss %2, %%xmm0\n\t" \
01137 "movss %%xmm0, %%xmm1\n\t" \
01138 "movss %3, %%xmm4\n\t" \
01139 \
01140 "mulss %%xmm2, %%xmm0\n\t" \
01141 "movss %%xmm4, %%xmm5\n\t" \
01142 "mulss %%xmm3, %%xmm5\n\t" \
01143 "subss %%xmm0, %%xmm5\n\t" \
01144 \
01145 "mulss %%xmm3, %%xmm1\n\t" \
01146 "movss %%xmm4, %%xmm5\n\t" \
01147 "mulss %%xmm2, %%xmm5\n\t" \
01148 "addss %%xmm5, %%xmm1\n\t" \
01149 \
01150 "movss %%xmm0, %0\n\t" \
01151 "movss %%xmm1, %1\n\t"
01152 : "=m" (fpDest[iDataCntr]),
01153 "=m" (fpDest[iDataCntr + 1])
01154 : "m" (fpSrc1[iDataCntr]),
01155 "m" (fpSrc1[iDataCntr + 1]),
01156 "m" (fpSrc2[iDataCntr]),
01157 "m" (fpSrc2[iDataCntr + 1])
01158 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01159 }
01160 }
01161
01162
01163 void dsp_x86_sse_cmul3 (double *dpDest, const double *dpSrc1,
01164 const double *dpSrc2, int iDataLength)
01165 {
01166 int iDataCntr;
01167 int iDataCount;
01168
01169 iDataCount = (iDataLength << 1);
01170 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01171 {
01172 X86_ASM (
01173 "movsd %4, %%xmm2\n\t" \
01174 "movsd %5, %%xmm3\n\t" \
01175 \
01176 "movsd %2, %%xmm0\n\t" \
01177 "movsd %%xmm0, %%xmm1\n\t" \
01178 "movsd %3, %%xmm4\n\t" \
01179 \
01180 "mulsd %%xmm2, %%xmm0\n\t" \
01181 "movsd %%xmm4, %%xmm5\n\t" \
01182 "mulsd %%xmm3, %%xmm5\n\t" \
01183 "subsd %%xmm0, %%xmm5\n\t" \
01184 \
01185 "mulsd %%xmm3, %%xmm1\n\t" \
01186 "movsd %%xmm4, %%xmm5\n\t" \
01187 "mulsd %%xmm2, %%xmm5\n\t" \
01188 "addsd %%xmm5, %%xmm1\n\t" \
01189 \
01190 "movsd %%xmm0, %0\n\t" \
01191 "movsd %%xmm1, %1\n\t"
01192 : "=m" (dpDest[iDataCntr]),
01193 "=m" (dpDest[iDataCntr + 1])
01194 : "m" (dpSrc1[iDataCntr]),
01195 "m" (dpSrc1[iDataCntr + 1]),
01196 "m" (dpSrc2[iDataCntr]),
01197 "m" (dpSrc2[iDataCntr + 1])
01198 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01199 }
01200 }
01201
01202
01203 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01204 {
01205 int iDataCntr;
01206 int iDataCount;
01207 stpm64 m64pVect = (stpm64) fpVect;
01208 stm64 m64Mul;
01209 stm64 m64Add;
01210
01211 m64Mul.f[0] = m64Mul.f[1] = fMul;
01212 m64Add.f[0] = m64Add.f[1] = fAdd;
01213 iDataCount = (iDataLength >> 1);
01214 X86_ASM (
01215 "movq %0, %%mm1\n\t" \
01216 "movq %1, %%mm2\n\t"
01217 :
01218 : "m" (m64Mul),
01219 "m" (m64Add)
01220 : "mm1", "mm2", "memory");
01221 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01222 {
01223 X86_ASM (
01224 "movq %1, %%mm0\n\t" \
01225 "pfmul %%mm1, %%mm0\n\t" \
01226 "pfadd %%mm2, %%mm0\n\t" \
01227 "movntq %%mm0, %0\n\t"
01228 : "=m" (m64pVect[iDataCntr])
01229 : "m0" (m64pVect[iDataCntr])
01230 : "mm0", "mm1", "mm2", "memory");
01231 }
01232 if (iDataLength & 0x1)
01233 {
01234 X86_ASM (
01235 "movd %1, %%mm0\n\t" \
01236 "pfmul %%mm1, %%mm0\n\t" \
01237 "pfadd %%mm2, %%mm0\n\t" \
01238 "movd %%mm0, %0\n\t"
01239 : "=m" (fpVect[iDataLength - 1])
01240 : "m0" (fpVect[iDataLength - 1])
01241 : "mm0", "mm1", "mm2", "memory");
01242 }
01243 X86_ASM (
01244 "femms\n\t" \
01245 "sfence\n\t");
01246 }
01247
01248
/*
 * In-place multiply-add of a float vector using scalar SSE instructions:
 * every element becomes fpVect[i] * fMul + fAdd.
 *
 * fpVect       vector of iDataLength floats, read and overwritten
 * fMul         factor applied to every element
 * fAdd         offset added after the multiplication
 * iDataLength  number of elements in fpVect
 */
01249 void dsp_x86_sse_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01250 {
01251 int iDataCntr;
01252
/* xmm1 = fMul, xmm2 = fAdd.  The loop body relies on both registers still
   holding these values when each later asm statement runs. */
01253 X86_ASM (
01254 "movss %0, %%xmm1\n\t" \
01255 "movss %1, %%xmm2\n\t"
01256 :
01257 : "m" (fMul),
01258 "m" (fAdd)
01259 : "xmm1", "xmm2", "memory");
/* One element per iteration: load, multiply, add, store back. */
01260 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01261 {
01262 X86_ASM (
01263 "movss %1, %%xmm0\n\t" \
01264 "mulss %%xmm1, %%xmm0\n\t" \
01265 "addss %%xmm2, %%xmm0\n\t" \
01266 "movss %%xmm0, %0\n\t"
01267 : "=m" (fpVect[iDataCntr])
01268 : "m0" (fpVect[iDataCntr])
01269 : "xmm0", "xmm1", "xmm2", "memory");
01270 }
01271 }
01272
01273
/*
 * In-place multiply-add of a double vector using scalar double-precision
 * SSE2 instructions (movsd/mulsd/addsd): every element becomes
 * dpVect[i] * dMul + dAdd.
 *
 * dpVect       vector of iDataLength doubles, read and overwritten
 * dMul         factor applied to every element
 * dAdd         offset added after the multiplication
 * iDataLength  number of elements in dpVect
 */
01274 void dsp_x86_sse_ma (double *dpVect, double dMul, double dAdd, int iDataLength)
01275 {
01276 int iDataCntr;
01277
/* xmm1 = dMul, xmm2 = dAdd.  The loop body relies on both registers still
   holding these values when each later asm statement runs. */
01278 X86_ASM (
01279 "movsd %0, %%xmm1\n\t" \
01280 "movsd %1, %%xmm2\n\t"
01281 :
01282 : "m" (dMul),
01283 "m" (dAdd)
01284 : "xmm1", "xmm2", "memory");
/* One element per iteration: load, multiply, add, store back. */
01285 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01286 {
01287 X86_ASM (
01288 "movsd %1, %%xmm0\n\t" \
01289 "mulsd %%xmm1, %%xmm0\n\t" \
01290 "addsd %%xmm2, %%xmm0\n\t" \
01291 "movsd %%xmm0, %0\n\t"
01292 : "=m" (dpVect[iDataCntr])
01293 : "m0" (dpVect[iDataCntr])
01294 : "xmm0", "xmm1", "xmm2", "memory");
01295 }
01296 }
01297
01298
/*
 * Out-of-place multiply-add of a float vector using 3DNow! instructions:
 * fpDest[i] = fpSrc[i] * fMul + fAdd.
 *
 * fpDest       output vector of iDataLength floats
 * fpSrc        input vector of iDataLength floats
 * fMul         factor applied to every element
 * fAdd         offset added after the multiplication
 * iDataLength  number of elements
 *
 * Two elements are processed per iteration in one 64-bit MMX register; an
 * odd last element is processed on its own with 32-bit moves.
 */
01299 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
01300 float fMul, float fAdd, int iDataLength)
01301 {
01302 int iDataCntr;
01303 int iDataCount;
01304 stpm64 m64pDest = (stpm64) fpDest;
01305 stpm64 m64pSrc = (stpm64) fpSrc;
01306 stm64 m64Mul;
01307 stm64 m64Add;
01308
/* Replicate each scalar into both 32-bit lanes of a 64-bit operand. */
01309 m64Mul.f[0] = m64Mul.f[1] = fMul;
01310 m64Add.f[0] = m64Add.f[1] = fAdd;
/* Number of complete element pairs. */
01311 iDataCount = (iDataLength >> 1);
/* mm1 = multiplier pair, mm2 = offset pair.  The asm statements below rely
   on both registers still holding these values. */
01312 X86_ASM (
01313 "movq %0, %%mm1\n\t" \
01314 "movq %1, %%mm2\n\t"
01315 :
01316 : "m" (m64Mul),
01317 "m" (m64Add)
01318 : "mm1", "mm2", "memory");
/* Scale one source pair and write it to the destination with a
   non-temporal store (movntq). */
01319 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01320 {
01321 X86_ASM (
01322 "movq %1, %%mm0\n\t" \
01323 "pfmul %%mm1, %%mm0\n\t" \
01324 "pfadd %%mm2, %%mm0\n\t" \
01325 "movntq %%mm0, %0\n\t"
01326 : "=m" (m64pDest[iDataCntr])
01327 : "m" (m64pSrc[iDataCntr])
01328 : "mm0", "mm1", "mm2", "memory");
01329 }
/* Odd length: movd transfers only 32 bits, so just the last element is
   read and written. */
01330 if (iDataLength & 0x1)
01331 {
01332 X86_ASM (
01333 "movd %1, %%mm0\n\t" \
01334 "pfmul %%mm1, %%mm0\n\t" \
01335 "pfadd %%mm2, %%mm0\n\t" \
01336 "movd %%mm0, %0\n\t"
01337 : "=m" (fpDest[iDataLength - 1])
01338 : "m" (fpSrc[iDataLength - 1])
01339 : "mm0", "mm1", "mm2", "memory");
01340 }
/* Leave MMX/3DNow! state (femms) and fence the non-temporal stores. */
01341 X86_ASM (
01342 "femms\n\t" \
01343 "sfence\n\t");
01344 }
01345
01346
/*
 * Out-of-place multiply-add of a float vector using scalar SSE
 * instructions: fpDest[i] = fpSrc[i] * fMul + fAdd.
 *
 * fpDest       output vector of iDataLength floats
 * fpSrc        input vector of iDataLength floats
 * fMul         factor applied to every element
 * fAdd         offset added after the multiplication
 * iDataLength  number of elements
 */
01347 void dsp_x86_sse_ma2f (float *fpDest, const float *fpSrc,
01348 float fMul, float fAdd, int iDataLength)
01349 {
01350 int iDataCntr;
01351
/* xmm1 = fMul, xmm2 = fAdd.  The loop body relies on both registers still
   holding these values when each later asm statement runs. */
01352 X86_ASM (
01353 "movss %0, %%xmm1\n\t" \
01354 "movss %1, %%xmm2\n\t"
01355 :
01356 : "m" (fMul),
01357 "m" (fAdd)
01358 : "xmm1", "xmm2", "memory");
/* One element per iteration: load source, multiply, add, store to dest. */
01359 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01360 {
01361 X86_ASM (
01362 "movss %1, %%xmm0\n\t" \
01363 "mulss %%xmm1, %%xmm0\n\t" \
01364 "addss %%xmm2, %%xmm0\n\t" \
01365 "movss %%xmm0, %0\n\t"
01366 : "=m" (fpDest[iDataCntr])
01367 : "m" (fpSrc[iDataCntr])
01368 : "xmm0", "xmm1", "xmm2", "memory");
01369 }
01370 }
01371
01372
/*
 * Out-of-place multiply-add of a double vector using scalar
 * double-precision SSE2 instructions: dpDest[i] = dpSrc[i] * dMul + dAdd.
 *
 * dpDest       output vector of iDataLength doubles
 * dpSrc        input vector of iDataLength doubles
 * dMul         factor applied to every element
 * dAdd         offset added after the multiplication
 * iDataLength  number of elements
 */
01373 void dsp_x86_sse_ma2 (double *dpDest, const double *dpSrc,
01374 double dMul, double dAdd, int iDataLength)
01375 {
01376 int iDataCntr;
01377
/* xmm1 = dMul, xmm2 = dAdd.  The loop body relies on both registers still
   holding these values when each later asm statement runs. */
01378 X86_ASM (
01379 "movsd %0, %%xmm1\n\t" \
01380 "movsd %1, %%xmm2\n\t"
01381 :
01382 : "m" (dMul),
01383 "m" (dAdd)
01384 : "xmm1", "xmm2", "memory");
/* One element per iteration: load source, multiply, add, store to dest. */
01385 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01386 {
01387 X86_ASM (
01388 "movsd %1, %%xmm0\n\t" \
01389 "mulsd %%xmm1, %%xmm0\n\t" \
01390 "addsd %%xmm2, %%xmm0\n\t" \
01391 "movsd %%xmm0, %0\n\t"
01392 : "=m" (dpDest[iDataCntr])
01393 : "m" (dpSrc[iDataCntr])
01394 : "xmm0", "xmm1", "xmm2", "memory");
01395 }
01396 }
01397
01398
/*
 * In-place add-then-multiply of a float vector using 3DNow! instructions:
 * every element becomes (fpVect[i] + fAdd) * fMul.
 *
 * fpVect       vector of iDataLength floats, read and overwritten
 * fAdd         offset added to every element first
 * fMul         factor applied to the sum
 * iDataLength  number of elements in fpVect
 *
 * Two elements are processed per iteration in one 64-bit MMX register; an
 * odd last element is processed on its own with 32-bit moves.
 */
01399 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01400 {
01401 int iDataCntr;
01402 int iDataCount;
01403 stpm64 m64pVect = (stpm64) fpVect;
01404 stm64 m64Add;
01405 stm64 m64Mul;
01406
/* Replicate each scalar into both 32-bit lanes of a 64-bit operand. */
01407 m64Add.f[0] = m64Add.f[1] = fAdd;
01408 m64Mul.f[0] = m64Mul.f[1] = fMul;
/* Number of complete element pairs. */
01409 iDataCount = (iDataLength >> 1);
/* mm1 = offset pair, mm2 = multiplier pair.  The asm statements below rely
   on both registers still holding these values. */
01410 X86_ASM (
01411 "movq %0, %%mm1\n\t" \
01412 "movq %1, %%mm2\n\t"
01413 :
01414 : "m" (m64Add),
01415 "m" (m64Mul)
01416 : "mm1", "mm2", "memory");
/* Add first, then multiply; write back with a non-temporal store. */
01417 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01418 {
01419 X86_ASM (
01420 "movq %1, %%mm0\n\t" \
01421 "pfadd %%mm1, %%mm0\n\t" \
01422 "pfmul %%mm2, %%mm0\n\t" \
01423 "movntq %%mm0, %0\n\t"
01424 : "=m" (m64pVect[iDataCntr])
01425 : "m0" (m64pVect[iDataCntr])
01426 : "mm0", "mm1", "mm2", "memory");
01427 }
/* Odd length: movd transfers only 32 bits, so just the last element is
   read and written. */
01428 if (iDataLength & 0x1)
01429 {
01430 X86_ASM (
01431 "movd %1, %%mm0\n\t" \
01432 "pfadd %%mm1, %%mm0\n\t" \
01433 "pfmul %%mm2, %%mm0\n\t" \
01434 "movd %%mm0, %0\n\t"
01435 : "=m" (fpVect[iDataLength - 1])
01436 : "m0" (fpVect[iDataLength - 1])
01437 : "mm0", "mm1", "mm2", "memory");
01438 }
/* Leave MMX/3DNow! state (femms) and fence the non-temporal stores. */
01439 X86_ASM (
01440 "femms\n\t" \
01441 "sfence\n\t");
01442 }
01443
01444
/*
 * In-place add-then-multiply of a float vector using scalar SSE
 * instructions: every element becomes (fpVect[i] + fAdd) * fMul.
 *
 * fpVect       vector of iDataLength floats, read and overwritten
 * fAdd         offset added to every element first
 * fMul         factor applied to the sum
 * iDataLength  number of elements in fpVect
 */
01445 void dsp_x86_sse_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01446 {
01447 int iDataCntr;
01448
/* xmm1 = fAdd, xmm2 = fMul.  The loop body relies on both registers still
   holding these values when each later asm statement runs. */
01449 X86_ASM (
01450 "movss %0, %%xmm1\n\t" \
01451 "movss %1, %%xmm2\n\t"
01452 :
01453 : "m" (fAdd),
01454 "m" (fMul)
01455 : "xmm1", "xmm2", "memory");
/* One element per iteration: load, add, multiply, store back. */
01456 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01457 {
01458 X86_ASM (
01459 "movss %1, %%xmm0\n\t" \
01460 "addss %%xmm1, %%xmm0\n\t" \
01461 "mulss %%xmm2, %%xmm0\n\t" \
01462 "movss %%xmm0, %0\n\t"
01463 : "=m" (fpVect[iDataCntr])
01464 : "m0" (fpVect[iDataCntr])
01465 : "xmm0", "xmm1", "xmm2", "memory");
01466 }
01467 }
01468
01469
/*
 * Multiply-accumulate (dot product) of two float vectors using 3DNow!
 * instructions: returns the sum of fpSrc1[i] * fpSrc2[i].
 *
 * fpSrc1       first input vector of iDataLength floats
 * fpSrc2       second input vector of iDataLength floats
 * iDataLength  number of elements in each vector
 *
 * Returns the accumulated sum (0 when no element is processed, because the
 * accumulator is cleared first).
 */
01470 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2,
01471 int iDataLength)
01472 {
01473 int iDataCntr;
01474 int iDataCount;
01475 float fRes;
01476 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01477 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01478
/* Number of complete element pairs. */
01479 iDataCount = (iDataLength >> 1);
/* Clear the accumulator mm0; later asm statements rely on mm0 keeping its
   contents between them. */
01480 X86_ASM (
01481 "pxor %%mm0, %%mm0\n\t"
01482 :
01483 :
01484 : "mm0");
/* Multiply one pair from each vector; pfacc folds the two products and the
   two halves of the accumulator into mm0. */
01485 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01486 {
01487 X86_ASM (
01488 "movq %0, %%mm1\n\t" \
01489 "movq %1, %%mm2\n\t" \
01490 "pfmul %%mm2, %%mm1\n\t" \
01491 "pfacc %%mm1, %%mm0\n\t"
01492 :
01493 : "m" (m64pSrc1[iDataCntr]),
01494 "m" (m64pSrc2[iDataCntr])
01495 : "mm0", "mm1", "mm2", "memory");
01496 }
/* Odd length: movd loads the last element of each vector with a zero upper
   lane, so only that single product is added. */
01497 if (iDataLength & 0x1)
01498 {
01499 X86_ASM (
01500 "movd %0, %%mm1\n\t" \
01501 "movd %1, %%mm2\n\t" \
01502 "pfmul %%mm2, %%mm1\n\t" \
01503 "pfacc %%mm1, %%mm0\n\t"
01504 :
01505 : "m" (fpSrc1[iDataLength - 1]),
01506 "m" (fpSrc2[iDataLength - 1])
01507 : "mm0", "mm1", "mm2", "memory");
01508 }
/* Add the two halves of the accumulator and store the low 32 bits. */
01509 X86_ASM (
01510 "pfacc %%mm0, %%mm0\n\t" \
01511 "movd %%mm0, %0\n\t"
01512 : "=m" (fRes)
01513 :
01514 : "mm0", "memory");
/* Leave MMX/3DNow! state before returning to C floating-point code. */
01515 X86_ASM ("femms\n\t");
01516
01517 return fRes;
01518 }
01519
01520
/*
 * Multiply-accumulate (dot product) of two float vectors using scalar SSE
 * instructions: returns the sum of fpSrc1[i] * fpSrc2[i].
 *
 * fpSrc1       first input vector of iDataLength floats
 * fpSrc2       second input vector of iDataLength floats
 * iDataLength  number of elements in each vector
 *
 * Returns the accumulated sum (0 when the loop does not run, because the
 * accumulator is cleared first).
 */
01521 float dsp_x86_sse_macf (const float *fpSrc1, const float *fpSrc2,
01522 int iDataLength)
01523 {
01524 int iDataCntr;
01525 float fRes;
01526
/* Clear the accumulator xmm0; later asm statements rely on xmm0 keeping
   its contents between them. */
01527 X86_ASM (
01528 "xorps %%xmm0, %%xmm0\n\t"
01529 :
01530 :
01531 : "xmm0");
/* xmm0 += fpSrc1[i] * fpSrc2[i] */
01532 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01533 {
01534 X86_ASM (
01535 "movss %0, %%xmm1\n\t" \
01536 "mulss %1, %%xmm1\n\t" \
01537 "addss %%xmm1, %%xmm0\n\t"
01538 :
01539 : "m" (fpSrc1[iDataCntr]),
01540 "m" (fpSrc2[iDataCntr])
01541 : "xmm0", "xmm1", "xmm2", "memory");
01542 }
/* Copy the accumulated sum out of xmm0. */
01543 X86_ASM (
01544 "movss %%xmm0, %0\n\t"
01545 : "=m" (fRes)
01546 :
01547 : "xmm0");
01548
01549 return fRes;
01550 }
01551
01552
/*
 * Multiply-accumulate (dot product) of two double vectors using scalar
 * double-precision SSE2 instructions: returns the sum of
 * dpSrc1[i] * dpSrc2[i].
 *
 * dpSrc1       first input vector of iDataLength doubles
 * dpSrc2       second input vector of iDataLength doubles
 * iDataLength  number of elements in each vector
 *
 * Returns the accumulated sum (0 when the loop does not run, because the
 * accumulator is cleared first).
 */
01553 double dsp_x86_sse_mac (const double *dpSrc1, const double *dpSrc2,
01554 int iDataLength)
01555 {
01556 int iDataCntr;
01557 double dRes;
01558
/* Clear the accumulator xmm0; later asm statements rely on xmm0 keeping
   its contents between them. */
01559 X86_ASM (
01560 "xorpd %%xmm0, %%xmm0\n\t"
01561 :
01562 :
01563 : "xmm0");
/* xmm0 += dpSrc1[i] * dpSrc2[i] */
01564 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01565 {
01566 X86_ASM (
01567 "movsd %0, %%xmm1\n\t" \
01568 "mulsd %1, %%xmm1\n\t" \
01569 "addsd %%xmm1, %%xmm0\n\t"
01570 :
01571 : "m" (dpSrc1[iDataCntr]),
01572 "m" (dpSrc2[iDataCntr])
01573 : "xmm0", "xmm1", "xmm2", "memory");
01574 }
/* Copy the accumulated sum out of xmm0. */
01575 X86_ASM (
01576 "movsd %%xmm0, %0\n\t"
01577 : "=m" (dRes)
01578 :
01579 :
01579 : "xmm0");
01580
01581 return dRes;
01582 }
01583
01584
01585 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
01586 int iDataLength)
01587 {
01588 int iDataCntr;
01589 int iDataCount;
01590 stm64 m64Min;
01591 stm64 m64Max;
01592 stpm64 m64pSrc = (stpm64) fpSrc;
01593
01594 m64Min.f[0] = m64Min.f[1] = FLT_MAX;
01595 m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
01596 iDataCount = (iDataLength >> 1);
01597 X86_ASM (
01598 "movq %0, %%mm1\n\t" \
01599 "movq %1, %%mm2\n\t"
01600 :
01601 : "m" (m64Min),
01602 "m" (m64Max)
01603 : "mm1", "mm2", "memory");
01604 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01605 {
01606 X86_ASM (
01607 "movq %0, %%mm0\n\t" \
01608 "pfmin %%mm0, %%mm1\n\t" \
01609 "pfmax %%mm0, %%mm2\n\t"
01610 :
01611 : "m" (m64pSrc[iDataCntr])
01612 : "mm0", "mm1", "mm2", "memory");
01613 }
01614 if (iDataLength & 0x1)
01615 {
01616 X86_ASM (
01617 "movd %0, %%mm0\n\t" \
01618 "pfmin %%mm0, %%mm1\n\t" \
01619 "pfmax %%mm0, %%mm2\n\t"
01620 :
01621 : "m" (fpSrc[iDataLength - 1])
01622 : "mm0", "mm1", "mm2", "memory");
01623 }
01624 X86_ASM (
01625 "pswapd %%mm1, %%mm3\n\t" \
01626 "pfmin %%mm3, %%mm1\n\t" \
01627 "pswapd %%mm2, %%mm3\n\t" \
01628 "pfmax %%mm3, %%mm2\n\t" \
01629 "movd %%mm1, %0\n\t" \
01630 "movd %%mm2, %1\n\t"
01631 : "=m" (*fpMin),
01632 "=m" (*fpMax)
01633 :
01634 : "mm1", "mm2", "mm3", "memory");
01635 X86_ASM ("femms\n\t");
01636 }
01637
01638
/*
 * Find the smallest and largest value of a float vector using scalar SSE
 * instructions (minss/maxss).
 *
 * fpMin        receives the minimum of fpSrc (FLT_MAX when the loop does
 *              not run)
 * fpMax        receives the maximum of fpSrc (-FLT_MAX when the loop does
 *              not run)
 * fpSrc        input vector of iDataLength floats
 * iDataLength  number of elements in fpSrc
 */
01639 void dsp_x86_sse_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
01640 int iDataLength)
01641 {
01642 int iDataCntr;
01643
/* Seed the outputs so that any input value replaces the seed. */
01644 *fpMin = FLT_MAX;
01645 *fpMax = -FLT_MAX;
/* xmm0 = running minimum, xmm1 = running maximum.  Later asm statements
   rely on both registers keeping their contents between them. */
01646 X86_ASM (
01647 "movss %0, %%xmm0\n\t" \
01648 "movss %1, %%xmm1\n\t"
01649 :
01650 : "m" (*fpMin),
01651 "m" (*fpMax)
01652 : "xmm0", "xmm1", "memory");
/* Compare every element against both running values. */
01653 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01654 {
01655 X86_ASM (
01656 "movss %0, %%xmm2\n\t" \
01657 "minss %%xmm2, %%xmm0\n\t" \
01658 "maxss %%xmm2, %%xmm1\n\t"
01659 :
01660 : "m" (fpSrc[iDataCntr])
01661 : "xmm0", "xmm1", "xmm2", "memory");
01662 }
/* Write the results back to the caller's variables. */
01663 X86_ASM (
01664 "movss %%xmm0, %0\n\t" \
01665 "movss %%xmm1, %1\n\t"
01666 : "=m" (*fpMin),
01667 "=m" (*fpMax)
01668 :
01669 : "xmm0", "xmm1", "memory");
01670 }
01671
01672
01673 void dsp_x86_sse_minmax (double *dpMin, double *dpMax, const double *dpSrc,
01674 int iDataLength)
01675 {
01676 int iDataCntr;
01677
01678 *dpMin = FLT_MAX;
01679 *dpMax = -FLT_MAX;
01680 X86_ASM (
01681 "movsd %0, %%xmm0\n\t" \
01682 "movsd %1, %%xmm1\n\t"
01683 :
01684 : "m" (*dpMin),
01685 "m" (*dpMax)
01686 : "xmm0", "xmm1", "memory");
01687 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01688 {
01689 X86_ASM (
01690 "movsd %0, %%xmm2\n\t" \
01691 "minsd %%xmm2, %%xmm0\n\t" \
01692 "maxsd %%xmm2, %%xmm1\n\t"
01693 :
01694 : "m" (dpSrc[iDataCntr])
01695 : "xmm0", "xmm1", "xmm2", "memory");
01696 }
01697 X86_ASM (
01698 "movss %%xmm0, %0\n\t" \
01699 "movss %%xmm1, %1\n\t"
01700 : "=m" (*dpMin),
01701 "=m" (*dpMax)
01702 :
01703 : "xmm0", "xmm1", "memory");
01704 }
01705
01706
/*
 * Normalised cross-correlation (at zero lag) of two float vectors using
 * 3DNow! instructions.
 *
 * With P = sum(src1*src2), S1 = sum(src1^2), S2 = sum(src2^2) and
 * scale = 1 / iDataLength, the final asm block computes
 *     (P * scale) / (sqrt(S1 * S2) * scale)
 * using the 3DNow! reciprocal and reciprocal-square-root instructions.
 *
 * fpSrc1       first input vector of iDataLength floats
 * fpSrc2       second input vector of iDataLength floats
 * iDataLength  number of elements in each vector
 *
 * Returns the value described above.
 */
01707 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01708 int iDataLength)
01709 {
01710 int iDataCntr;
01711 int iDataCount;
01712 float fRes;
01713 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01714 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01715
/* Number of complete element pairs. */
01716 iDataCount = (iDataLength >> 1);
/* Clear the accumulators: mm3 = sum of src1 squares, mm4 = sum of src2
   squares, mm5 = sum of products.  Later asm statements rely on these
   registers keeping their contents between them. */
01717 X86_ASM (
01718 "pxor %%mm3, %%mm3\n\t" \
01719 "pxor %%mm4, %%mm4\n\t" \
01720 "pxor %%mm5, %%mm5\n\t"
01721 :
01722 :
01723 : "mm3", "mm4", "mm5");
/* Accumulate the three sums for one pair of elements per iteration. */
01724 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01725 {
01726 X86_ASM (
01727 "movq %0, %%mm0\n\t" \
01728 "movq %1, %%mm1\n\t" \
01729 "movq %%mm1, %%mm2\n\t" \
01730 "pfmul %%mm0, %%mm2\n\t" \
01731 "pfacc %%mm2, %%mm5\n\t" \
01732 "pfmul %%mm0, %%mm0\n\t" \
01733 "pfacc %%mm0, %%mm3\n\t" \
01734 "pfmul %%mm1, %%mm1\n\t" \
01735 "pfacc %%mm1, %%mm4\n\t"
01736 :
01737 : "m" (m64pSrc1[iDataCntr]),
01738 "m" (m64pSrc2[iDataCntr])
01739 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01740 }
/* Odd length: same accumulation for the last element; movd leaves the
   upper lane zero, so it contributes nothing to the sums. */
01741 if (iDataLength & 0x1)
01742 {
01743 X86_ASM (
01744 "movd %0, %%mm0\n\t" \
01745 "movd %1, %%mm1\n\t" \
01746 "movq %%mm1, %%mm2\n\t" \
01747 "pfmul %%mm0, %%mm2\n\t" \
01748 "pfacc %%mm2, %%mm5\n\t" \
01749 "pfmul %%mm0, %%mm0\n\t" \
01750 "pfacc %%mm0, %%mm3\n\t" \
01751 "pfmul %%mm1, %%mm1\n\t" \
01752 "pfacc %%mm1, %%mm4\n\t"
01753 :
01754 : "m" (fpSrc1[iDataLength - 1]),
01755 "m" (fpSrc2[iDataLength - 1])
01756 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01757 }
/* Final stage, in order:
   - fold each accumulator so both lanes hold the total;
   - convert iDataLength to float and take its reciprocal (scale);
   - mm4 = S1 * S2, then its square root via pfrsqrt and refinement;
   - mm4 = reciprocal of (sqrt * scale);
   - result = P * scale * mm4.
   NOTE(review): pfrcpit1/pfrcpit2 leave the refined reciprocal of the
   length in mm7, while the multiplications below use mm6 (the initial
   pfrcp estimate).  The scale appears in both numerator and denominator,
   so this may be harmless; confirm whether mm7 was intended. */
01758 X86_ASM (
01759 "pfacc %%mm3, %%mm3\n\t" \
01760 "pfacc %%mm4, %%mm4\n\t" \
01761 "pfacc %%mm5, %%mm5\n\t" \
01762 \
01763 "movd %1, %%mm6\n\t" \
01764 "pswapd %%mm6, %%mm7\n\t" \
01765 "paddd %%mm7, %%mm6\n\t" \
01766 "pi2fd %%mm6, %%mm7\n\t" \
01767 \
01768 "pfrcp %%mm7, %%mm6\n\t" \
01769 "pfrcpit1 %%mm6, %%mm7\n\t" \
01770 "pfrcpit2 %%mm6, %%mm7\n\t" \
01771 \
01772 "pfmul %%mm3, %%mm4\n\t" \
01773 \
01774 "movq %%mm4, %%mm0\n\t" \
01775 "pfrsqrt %%mm4, %%mm1\n\t" \
01776 "movq %%mm1, %%mm2\n\t" \
01777 "pfmul %%mm1, %%mm1\n\t" \
01778 "pfrsqit1 %%mm4, %%mm1\n\t" \
01779 "pfrcpit2 %%mm2, %%mm1\n\t" \
01780 "pfmul %%mm1, %%mm4\n\t" \
01781 \
01782 "pfmul %%mm6, %%mm4\n\t" \
01783 \
01784 "pfrcp %%mm4, %%mm0\n\t" \
01785 "pfrcpit1 %%mm0, %%mm4\n\t" \
01786 "pfrcpit2 %%mm0, %%mm4\n\t" \
01787 \
01788 "pfmul %%mm6, %%mm5\n\t" \
01789 "pfmul %%mm4, %%mm5\n\t" \
01790 "movd %%mm5, %0\n\t"
01791 : "=m" (fRes)
01792 : "m" (iDataLength)
01793 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
/* Leave MMX/3DNow! state before returning to C floating-point code. */
01794 X86_ASM ("femms\n\t");
01795
01796 return fRes;
01797 }
01798
01799
01800 float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01801 int iDataLength)
01802 {
01803 int iDataCntr;
01804 float fScale;
01805 float fNormFact;
01806 float fProdSum;
01807 float fSqSum1;
01808 float fSqSum2;
01809 float fRes;
01810
01811 X86_ASM (
01812 "xorps %%xmm0, %%xmm0\n\t" \
01813 "xorps %%xmm1, %%xmm1\n\t" \
01814 "xorps %%xmm2, %%xmm2\n\t"
01815 :
01816 :
01817 : "xmm0", "xmm1", "xmm2");
01818 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01819 {
01820 X86_ASM (
01821 "movss %3, %%xmm3\n\t" \
01822 "movss %4, %%xmm4\n\t" \
01823 \
01824 "movss %%xmm4, %%xmm5\n\t" \
01825 "mulss %%xmm3, %%xmm5\n\t" \
01826 "addss %%xmm5, %%xmm0\n\t" \
01827 \
01828 "movss %%xmm3, %%xmm5\n\t" \
01829 "mulss %%xmm3, %%xmm5\n\t" \
01830 "addss %%xmm5, %%xmm1\n\t" \
01831 \
01832 "movss %%xmm4, %%xmm5\n\t" \
01833 "mulss %%xmm4, %%xmm5\n\t" \
01834 "addss %%xmm5, %%xmm2\n\t" \
01835 \
01836 "movss %%xmm0, %0\n\t" \
01837 "movss %%xmm1, %1\n\t" \
01838 "movss %%xmm2, %2\n\t"
01839 : "=m" (fProdSum),
01840 "=m" (fSqSum1),
01841 "=m" (fSqSum2)
01842 : "m" (fpSrc1[iDataCntr]),
01843 "m" (fpSrc2[iDataCntr])
01844 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01845 }
01846 fScale = 1.0F / iDataLength;
01847 fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
01848 fRes = (fProdSum * fScale) / fNormFact;
01849
01850 return fRes;
01851 }
01852
01853
01854 double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2,
01855 int iDataLength)
01856 {
01857 int iDataCntr;
01858 double dScale;
01859 double dNormFact;
01860 double dProdSum;
01861 double dSqSum1;
01862 double dSqSum2;
01863 double dRes;
01864
01865 X86_ASM (
01866 "xorpd %%xmm0, %%xmm0\n\t" \
01867 "xorpd %%xmm1, %%xmm1\n\t" \
01868 "xorpd %%xmm2, %%xmm2\n\t"
01869 :
01870 :
01871 : "xmm0", "xmm1", "xmm2");
01872 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01873 {
01874 X86_ASM (
01875 "movsd %3, %%xmm3\n\t" \
01876 "movsd %4, %%xmm4\n\t" \
01877 \
01878 "movsd %%xmm4, %%xmm5\n\t" \
01879 "mulsd %%xmm3, %%xmm5\n\t" \
01880 "addsd %%xmm5, %%xmm0\n\t" \
01881 \
01882 "movsd %%xmm3, %%xmm5\n\t" \
01883 "mulsd %%xmm3, %%xmm5\n\t" \
01884 "addsd %%xmm5, %%xmm1\n\t" \
01885 \
01886 "movsd %%xmm4, %%xmm5\n\t" \
01887 "mulsd %%xmm4, %%xmm5\n\t" \
01888 "addsd %%xmm5, %%xmm2\n\t" \
01889 \
01890 "movsd %%xmm0, %0\n\t" \
01891 "movsd %%xmm1, %1\n\t" \
01892 "movsd %%xmm2, %2\n\t"
01893 : "=m" (dProdSum),
01894 "=m" (dSqSum1),
01895 "=m" (dSqSum2)
01896 : "m" (dpSrc1[iDataCntr]),
01897 "m" (dpSrc2[iDataCntr])
01898 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01899 }
01900 dScale = 1.0 / iDataLength;
01901 dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
01902 dRes = (dProdSum * dScale) / dNormFact;
01903
01904 return dRes;
01905 }
01906
01907
01908 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
01909 int iIntMax)
01910 {
01911 int iDataCntr;
01912 float fScale;
01913
01914 X86_ASM (
01915 "movd %1, %%mm1\n\t" \
01916 "pswapd %%mm1, %%mm2\n\t" \
01917 "paddd %%mm2, %%mm1\n\t" \
01918 "pi2fd %%mm1, %%mm1\n\t" \
01919 "pfrcp %%mm1, %%mm2\n\t" \
01920 "pfrcpit1 %%mm2, %%mm1\n\t" \
01921 "pfrcpit2 %%mm2, %%mm1\n\t" \
01922 "movd %%mm1, %0\n\t"
01923 : "=m" (fScale)
01924 : "m" (iIntMax)
01925 : "mm1", "mm2", "memory");
01926 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01927 {
01928 X86_ASM (
01929 "movd %1, %%mm0\n\t" \
01930 "punpcklwd %%mm0, %%mm0\n\t" \
01931 "pi2fw %%mm0, %%mm0\n\t" \
01932 "pfmul %%mm1, %%mm0\n\t" \
01933 "movntq %%mm0, %0\n\t"
01934 : "=m" (fpDest[iDataCntr])
01935 : "m" (ipSrc[iDataCntr])
01936 : "mm0", "mm1", "memory");
01937 }
01938 X86_ASM (
01939 "femms\n\t" \
01940 "sfence\n\t");
01941 if ((iDataLength % 2) != 0)
01942 {
01943 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01944 }
01945 }
01946
01947
01948 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
01949 int iIntMax)
01950 {
01951 int iDataCntr;
01952 float fScale;
01953
01954 X86_ASM (
01955 "movd %1, %%mm1\n\t" \
01956 "pswapd %%mm1, %%mm2\n\t" \
01957 "paddd %%mm2, %%mm1\n\t" \
01958 "pi2fd %%mm1, %%mm1\n\t" \
01959 "pfrcp %%mm1, %%mm2\n\t" \
01960 "pfrcpit1 %%mm2, %%mm1\n\t" \
01961 "pfrcpit2 %%mm2, %%mm1\n\t" \
01962 "movd %%mm1, %0\n\t"
01963 : "=m" (fScale)
01964 : "m" (iIntMax)
01965 : "mm1", "mm2", "memory");
01966 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01967 {
01968 X86_ASM (
01969 "movq %1, %%mm0\n\t" \
01970 "pi2fd %%mm0, %%mm0\n\t" \
01971 "pfmul %%mm1, %%mm0\n\t" \
01972 "movntq %%mm0, %0\n\t"
01973 : "=m" (fpDest[iDataCntr])
01974 : "m" (ipSrc[iDataCntr])
01975 : "mm0", "mm1", "memory");
01976 }
01977 X86_ASM (
01978 "femms\n\t" \
01979 "sfence\n\t");
01980 if ((iDataLength % 2) != 0)
01981 {
01982 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01983 }
01984 }
01985
01986
01987 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength,
01988 const float *fpCoeff, int iCoeffLength)
01989 {
01990 int iSrcCntr;
01991 int iDestCntr;
01992 int iCoeffCntr;
01993 int iSrcCount;
01994 stpm64 m64pDest = (stpm64) fpDest;
01995
01996 iDestCntr = 0;
01997 iSrcCount = iDataLength + iCoeffLength;
01998 for (iSrcCntr = iCoeffLength;
01999 iSrcCntr < iSrcCount;
02000 iSrcCntr += 2)
02001 {
02002 X86_ASM (
02003 "pxor %%mm0, %%mm0\n\t"
02004 :
02005 :
02006 : "mm0");
02007 for (iCoeffCntr = 0;
02008 iCoeffCntr < iCoeffLength;
02009 iCoeffCntr++)
02010 {
02011 X86_ASM (
02012 "movq %0, %%mm1\n\t" \
02013 "movd %1, %%mm2\n\t" \
02014 "pswapd %%mm2, %%mm3\n\t" \
02015 "pfadd %%mm3, %%mm2\n\t" \
02016 "pfmul %%mm2, %%mm1\n\t" \
02017 "pfadd %%mm1, %%mm0\n\t"
02018 :
02019 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02020 "m" (fpCoeff[iCoeffCntr])
02021 : "mm0", "mm1", "mm2", "mm3", "memory");
02022 }
02023 X86_ASM (
02024 "movntq %%mm0, %0\n\t"
02025 : "=m" (m64pDest[iDestCntr++])
02026 :
02027 : "mm0", "memory");
02028 }
02029 if (iDataLength & 0x1)
02030 {
02031 X86_ASM (
02032 "pxor %%mm0, %%mm0\n\t"
02033 :
02034 :
02035 : "mm0");
02036 for (iCoeffCntr = 0;
02037 iCoeffCntr < iCoeffLength;
02038 iCoeffCntr++)
02039 {
02040 X86_ASM (
02041 "movd %0, %%mm1\n\t" \
02042 "movd %1, %%mm2\n\t" \
02043 "pfmul %%mm2, %%mm1\n\t" \
02044 "pfadd %%mm1, %%mm0\n\t"
02045 :
02046 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
02047 "m" (fpCoeff[iCoeffCntr])
02048 : "mm0", "mm1", "mm2", "memory");
02049 }
02050 X86_ASM (
02051 "movd %%mm0, %0\n\t"
02052 : "=m" (fpDest[iDataLength - 1])
02053 :
02054 : "mm0", "memory");
02055 }
02056 X86_ASM (
02057 "femms\n\t" \
02058 "sfence\n\t");
02059 }
02060
02061
/*
 * FIR filter for float data using scalar SSE instructions.
 *
 * For every output index d in [0, iDataLength):
 *     fpDest[d] = sum over c in [0, iCoeffLength) of
 *                 fpSrc[iCoeffLength + d - c] * fpCoeff[c]
 * so fpSrc is read at indices 1 .. iDataLength + iCoeffLength - 1.
 *
 * fpDest        output vector of iDataLength floats
 * fpSrc         input buffer, indexed as described above
 * iDataLength   number of output samples
 * fpCoeff       filter coefficients
 * iCoeffLength  number of coefficients
 */
02062 void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength,
02063 const float *fpCoeff, int iCoeffLength)
02064 {
02065 int iDestCntr;
02066 int iSrcCntr;
02067 int iCoeffCntr;
02068 int iSrcCount;
02069
02070 iDestCntr = 0;
/* The source index starts iCoeffLength elements into fpSrc. */
02071 iSrcCount = iDataLength + iCoeffLength;
02072 for (iSrcCntr = iCoeffLength;
02073 iSrcCntr < iSrcCount;
02074 iSrcCntr++)
02075 {
/* xmm0 accumulates one output sample; the inner asm statements rely on it
   keeping its contents between them. */
02076 X86_ASM (
02077 "xorps %%xmm0, %%xmm0\n\t"
02078 :
02079 :
02080 : "xmm0");
/* xmm0 += fpSrc[iSrcCntr - c] * fpCoeff[c] */
02081 for (iCoeffCntr = 0;
02082 iCoeffCntr < iCoeffLength;
02083 iCoeffCntr++)
02084 {
02085 X86_ASM (
02086 "movss %0, %%xmm1\n\t"
02087 "mulss %1, %%xmm1\n\t"
02088 "addss %%xmm1, %%xmm0\n\t"
02089 :
02090 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02091 "m" (fpCoeff[iCoeffCntr])
02092 : "xmm0", "xmm1", "memory");
02093 }
/* Store the finished sample and advance the output index. */
02094 X86_ASM (
02095 "movss %%xmm0, %0\n\t"
02096 : "=m" (fpDest[iDestCntr++])
02097 :
02098 : "xmm0", "memory");
02099 }
02100 }
02101
02102
/*
 * FIR filter for double data using scalar double-precision SSE2
 * instructions.
 *
 * For every output index d in [0, iDataLength):
 *     dpDest[d] = sum over c in [0, iCoeffLength) of
 *                 dpSrc[iCoeffLength + d - c] * dpCoeff[c]
 * so dpSrc is read at indices 1 .. iDataLength + iCoeffLength - 1.
 *
 * dpDest        output vector of iDataLength doubles
 * dpSrc         input buffer, indexed as described above
 * iDataLength   number of output samples
 * dpCoeff       filter coefficients
 * iCoeffLength  number of coefficients
 */
02103 void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength,
02104 const double *dpCoeff, int iCoeffLength)
02105 {
02106 int iDestCntr;
02107 int iSrcCntr;
02108 int iCoeffCntr;
02109 int iSrcCount;
02110
02111 iDestCntr = 0;
/* The source index starts iCoeffLength elements into dpSrc. */
02112 iSrcCount = iDataLength + iCoeffLength;
02113 for (iSrcCntr = iCoeffLength;
02114 iSrcCntr < iSrcCount;
02115 iSrcCntr++)
02116 {
/* xmm0 accumulates one output sample; the inner asm statements rely on it
   keeping its contents between them. */
02117 X86_ASM (
02118 "xorpd %%xmm0, %%xmm0\n\t"
02119 :
02120 :
02121 : "xmm0");
/* xmm0 += dpSrc[iSrcCntr - c] * dpCoeff[c] */
02122 for (iCoeffCntr = 0;
02123 iCoeffCntr < iCoeffLength;
02124 iCoeffCntr++)
02125 {
02126 X86_ASM (
02127 "movsd %0, %%xmm1\n\t"
02128 "mulsd %1, %%xmm1\n\t"
02129 "addsd %%xmm1, %%xmm0\n\t"
02130 :
02131 : "m" (dpSrc[iSrcCntr - iCoeffCntr]),
02132 "m" (dpCoeff[iCoeffCntr])
02133 : "xmm0", "xmm1", "memory");
02134 }
/* Store the finished sample and advance the output index. */
02135 X86_ASM (
02136 "movsd %%xmm0, %0\n\t"
02137 : "=m" (dpDest[iDestCntr++])
02138 :
02139 : "xmm0", "memory");
02140 }
02141 }
02142
02143
02144 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
02145 float *fpX, float *fpY)
02146 {
02147 int iDataCntr;
02148 stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02149 stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02150 stpm64 m64pX = (stpm64) fpX;
02151 stpm64 m64pY = (stpm64) fpY;
02152
02153 X86_ASM (
02154 "movq %0, %%mm0\n\t" \
02155 "pswapd %%mm0, %%mm2\n\t" \
02156 "movd %1, %%mm3\n\t" \
02157 "movq %2, %%mm0\n\t" \
02158 "pswapd %%mm0, %%mm4\n\t" \
02159 "movq %3, %%mm5\n\t" \
02160 "movq %4, %%mm7\n\t" \
02161 :
02162 : "m" (*m64pCoeff),
02163 "m" (fpCoeff[0]),
02164 "m" (*m64pCoeff2),
02165 "m" (*m64pX),
02166 "m" (*m64pY)
02167 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02168 for (iDataCntr = 0;
02169 iDataCntr < iDataLength;
02170 iDataCntr++)
02171 {
02172 X86_ASM (
02173 "pxor %%mm0, %%mm0\n\t" \
02174 "movd %1, %%mm6\n\t" \
02175 "movq %%mm5, %%mm1\n\t" \
02176 "pfmul %%mm2, %%mm1\n\t" \
02177 "pfacc %%mm1, %%mm0\n\t" \
02178 "movq %%mm6, %%mm1\n\t" \
02179 "pfmul %%mm3, %%mm1\n\t" \
02180 "pfacc %%mm1, %%mm0\n\t" \
02181 "movq %%mm7, %%mm1\n\t" \
02182 "pfmul %%mm4, %%mm1\n\t" \
02183 "pfacc %%mm1, %%mm0\n\t" \
02184 "pfacc %%mm0, %%mm0\n\t" \
02185 \
02186 "pswapd %%mm7, %%mm1\n\t" \
02187 "movq %%mm1, %%mm7\n\t" \
02188 "punpckldq %%mm0, %%mm7\n\t" \
02189 \
02190 "pswapd %%mm5, %%mm1\n\t" \
02191 "movq %%mm1, %%mm5\n\t" \
02192 "movq %%mm6, %%mm1\n\t" \
02193 "punpckldq %%mm1, %%mm5\n\t" \
02194 \
02195 "movd %%mm0, %0\n\t"
02196 : "=m" (fpVect[iDataCntr])
02197 : "m0" (fpVect[iDataCntr])
02198 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02199 }
02200 X86_ASM (
02201 "movq %%mm5, %0\n\t" \
02202 "movd %%mm6, %1\n\t" \
02203 "movq %%mm7, %2\n\t"
02204 : "=m" (*m64pX),
02205 "=m" (fpX[2]),
02206 "=m" (*m64pY)
02207 :
02208 : "mm5", "mm6", "mm7", "memory");
02209 X86_ASM ("femms\n\t");
02210 }
02211
02212
02213 void dsp_x86_sse_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
02214 float *fpX, float *fpY)
02215 {
02216 int iDataCntr;
02217
02218 X86_ASM (
02219 "movss %0, %%xmm1\n\t" \
02220 "movss %1, %%xmm2\n\t" \
02221 "movss %2, %%xmm3\n\t" \
02222 "movss %3, %%xmm4\n\t" \
02223 "prefetchnta %4\n\t"
02224 :
02225 : "m" (fpX[1]),
02226 "m" (fpX[2]),
02227 "m" (fpY[0]),
02228 "m" (fpY[1]),
02229 "m" (fpCoeff[0])
02230 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02231 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02232 {
02233 X86_ASM (
02234 "movss %%xmm1, %%xmm0\n\t" \
02235 "movss %%xmm2, %%xmm1\n\t" \
02236 "movss %1, %%xmm2\n\t" \
02237 \
02238 "movss %2, %%xmm5\n\t" \
02239 "mulss %%xmm2, %%xmm5\n\t" \
02240 "movss %3, %%xmm6\n\t" \
02241 "mulss %%xmm1, %%xmm6\n\t" \
02242 "addss %%xmm6, %%xmm5\n\t" \
02243 "movss %4, %%xmm6\n\t" \
02244 "mulss %%xmm0, %%xmm6\n\t" \
02245 "addss %%xmm6, %%xmm5\n\t" \
02246 \
02247 "movss %5, %%xmm6\n\t" \
02248 "mulss %%xmm4, %%xmm6\n\t" \
02249 "movss %6, %%xmm7\n\t" \
02250 "mulss %%xmm3, %%xmm7\n\t" \
02251 "addss %%xmm7, %%xmm6\n\t" \
02252 \
02253 "addss %%xmm5, %%xmm6\n\t" \
02254 "movss %%xmm4, %%xmm3\n\t" \
02255 "movss %%xmm6, %%xmm4\n\t" \
02256 \
02257 "movss %%xmm6, %0\n\t"
02258 : "=m" (fpVect[iDataCntr])
02259 : "m0" (fpVect[iDataCntr]),
02260 "m" (fpCoeff[0]),
02261 "m" (fpCoeff[1]),
02262 "m" (fpCoeff[2]),
02263 "m" (fpCoeff[3]),
02264 "m" (fpCoeff[4])
02265 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02266 "memory");
02267 }
02268 X86_ASM (
02269 "movss %%xmm0, %0\n\t" \
02270 "movss %%xmm1, %1\n\t" \
02271 "movss %%xmm2, %2\n\t" \
02272 "movss %%xmm3, %3\n\t" \
02273 "movss %%xmm4, %4\n\t"
02274 : "=m" (fpX[0]),
02275 "=m" (fpX[1]),
02276 "=m" (fpX[2]),
02277 "=m" (fpY[0]),
02278 "=m" (fpY[1])
02279 :
02280 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02281 }
02282
02283
02284 void dsp_x86_sse_iir (double *dpVect, int iDataLength, const double *dpCoeff,
02285 double *dpX, double *dpY)
02286 {
02287 int iDataCntr;
02288
02289 X86_ASM (
02290 "movsd %0, %%xmm1\n\t" \
02291 "movsd %1, %%xmm2\n\t" \
02292 "movsd %2, %%xmm3\n\t" \
02293 "movsd %3, %%xmm4\n\t" \
02294 "prefetchnta %4\n\t" \
02295 "prefetchnta %5\n\t"
02296 :
02297 : "m" (dpX[1]),
02298 "m" (dpX[2]),
02299 "m" (dpY[0]),
02300 "m" (dpY[1]),
02301 "m" (dpCoeff[0]),
02302 "m" (dpCoeff[3])
02303 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02304 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02305 {
02306 X86_ASM (
02307 "movsd %%xmm1, %%xmm0\n\t" \
02308 "movsd %%xmm2, %%xmm1\n\t" \
02309 "movsd %1, %%xmm2\n\t" \
02310 \
02311 "movsd %2, %%xmm5\n\t" \
02312 "mulsd %%xmm2, %%xmm5\n\t" \
02313 "movsd %3, %%xmm6\n\t" \
02314 "mulsd %%xmm1, %%xmm6\n\t" \
02315 "addsd %%xmm6, %%xmm5\n\t" \
02316 "movsd %4, %%xmm6\n\t" \
02317 "mulsd %%xmm0, %%xmm6\n\t" \
02318 "addsd %%xmm6, %%xmm5\n\t" \
02319 \
02320 "movsd %5, %%xmm6\n\t" \
02321 "mulsd %%xmm4, %%xmm6\n\t" \
02322 "movsd %6, %%xmm7\n\t" \
02323 "mulsd %%xmm3, %%xmm7\n\t" \
02324 "addsd %%xmm7, %%xmm6\n\t" \
02325 \
02326 "addsd %%xmm5, %%xmm6\n\t" \
02327 "movsd %%xmm4, %%xmm3\n\t" \
02328 "movsd %%xmm6, %%xmm4\n\t" \
02329 \
02330 "movsd %%xmm6, %0\n\t"
02331 : "=m" (dpVect[iDataCntr])
02332 : "m0" (dpVect[iDataCntr]),
02333 "m" (dpCoeff[0]),
02334 "m" (dpCoeff[1]),
02335 "m" (dpCoeff[2]),
02336 "m" (dpCoeff[3]),
02337 "m" (dpCoeff[4])
02338 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02339 "memory");
02340 }
02341 X86_ASM (
02342 "movsd %%xmm0, %0\n\t" \
02343 "movsd %%xmm1, %1\n\t" \
02344 "movsd %%xmm2, %2\n\t" \
02345 "movsd %%xmm3, %3\n\t" \
02346 "movsd %%xmm4, %4\n\t"
02347 : "=m" (dpX[0]),
02348 "=m" (dpX[1]),
02349 "=m" (dpX[2]),
02350 "=m" (dpY[0]),
02351 "=m" (dpY[1])
02352 :
02353 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02354 }
02355
02356
02357 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
02358 const float *fpCoeff, float *fpX, float *fpY)
02359 {
02360 int iDataCntr;
02361 stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02362 stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02363 stpm64 m64pX = (stpm64) fpX;
02364 stpm64 m64pY = (stpm64) fpY;
02365
02366 X86_ASM (
02367 "movq %0, %%mm0\n\t" \
02368 "pswapd %%mm0, %%mm2\n\t" \
02369 "movd %1, %%mm3\n\t" \
02370 "movq %2, %%mm0\n\t" \
02371 "pswapd %%mm0, %%mm4\n\t" \
02372 "movq %3, %%mm5\n\t" \
02373 "movq %4, %%mm7\n\t" \
02374 :
02375 : "m" (*m64pCoeff),
02376 "m" (fpCoeff[0]),
02377 "m" (*m64pCoeff2),
02378 "m" (*m64pX),
02379 "m" (*m64pY)
02380 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02381 for (iDataCntr = 0;
02382 iDataCntr < iDataLength;
02383 iDataCntr++)
02384 {
02385 X86_ASM (
02386 "pxor %%mm0, %%mm0\n\t" \
02387 "movd %1, %%mm6\n\t" \
02388 "movq %%mm5, %%mm1\n\t" \
02389 "pfmul %%mm2, %%mm1\n\t" \
02390 "pfacc %%mm1, %%mm0\n\t" \
02391 "movq %%mm6, %%mm1\n\t" \
02392 "pfmul %%mm3, %%mm1\n\t" \
02393 "pfacc %%mm1, %%mm0\n\t" \
02394 "movq %%mm7, %%mm1\n\t" \
02395 "pfmul %%mm4, %%mm1\n\t" \
02396 "pfacc %%mm1, %%mm0\n\t" \
02397 "pfacc %%mm0, %%mm0\n\t" \
02398 \
02399 "pswapd %%mm7, %%mm1\n\t" \
02400 "movq %%mm1, %%mm7\n\t" \
02401 "punpckldq %%mm0, %%mm7\n\t" \
02402 \
02403 "pswapd %%mm5, %%mm1\n\t" \
02404 "movq %%mm1, %%mm5\n\t" \
02405 "movq %%mm6, %%mm1\n\t" \
02406 "punpckldq %%mm1, %%mm5\n\t" \
02407 \
02408 "movd %%mm0, %0\n\t"
02409 : "=m" (fpDest[iDataCntr])
02410 : "m" (fpSrc[iDataCntr])
02411 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02412 }
02413 X86_ASM (
02414 "movq %%mm5, %0\n\t" \
02415 "movd %%mm6, %1\n\t" \
02416 "movq %%mm7, %2\n\t"
02417 : "=m" (*m64pX),
02418 "=m" (fpX[2]),
02419 "=m" (*m64pY)
02420 :
02421 : "mm5", "mm6", "mm7", "memory");
02422 X86_ASM ("femms\n\t");
02423 }
02424
02425
02426 void dsp_x86_sse_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
02427 const float *fpCoeff, float *fpX, float *fpY)
02428 {
02429 int iDataCntr;
02430
02431 X86_ASM (
02432 "movss %0, %%xmm1\n\t" \
02433 "movss %1, %%xmm2\n\t" \
02434 "movss %2, %%xmm3\n\t" \
02435 "movss %3, %%xmm4\n\t" \
02436 "prefetchnta %4\n\t"
02437 :
02438 : "m" (fpX[1]),
02439 "m" (fpX[2]),
02440 "m" (fpY[0]),
02441 "m" (fpY[1]),
02442 "m" (fpCoeff[0])
02443 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02444 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02445 {
02446 X86_ASM (
02447 "movss %%xmm1, %%xmm0\n\t" \
02448 "movss %%xmm2, %%xmm1\n\t" \
02449 "movss %1, %%xmm2\n\t" \
02450 \
02451 "movss %2, %%xmm5\n\t" \
02452 "mulss %%xmm2, %%xmm5\n\t" \
02453 "movss %3, %%xmm6\n\t" \
02454 "mulss %%xmm1, %%xmm6\n\t" \
02455 "addss %%xmm6, %%xmm5\n\t" \
02456 "movss %4, %%xmm6\n\t" \
02457 "mulss %%xmm0, %%xmm6\n\t" \
02458 "addss %%xmm6, %%xmm5\n\t" \
02459 \
02460 "movss %5, %%xmm6\n\t" \
02461 "mulss %%xmm4, %%xmm6\n\t" \
02462 "movss %6, %%xmm7\n\t" \
02463 "mulss %%xmm3, %%xmm7\n\t" \
02464 "addss %%xmm7, %%xmm6\n\t" \
02465 \
02466 "addss %%xmm5, %%xmm6\n\t" \
02467 "movss %%xmm4, %%xmm3\n\t" \
02468 "movss %%xmm6, %%xmm4\n\t" \
02469 \
02470 "movss %%xmm6, %0\n\t"
02471 : "=m" (fpDest[iDataCntr])
02472 : "m" (fpSrc[iDataCntr]),
02473 "m" (fpCoeff[0]),
02474 "m" (fpCoeff[1]),
02475 "m" (fpCoeff[2]),
02476 "m" (fpCoeff[3]),
02477 "m" (fpCoeff[4])
02478 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02479 "memory");
02480 }
02481 X86_ASM (
02482 "movss %%xmm0, %0\n\t" \
02483 "movss %%xmm1, %1\n\t" \
02484 "movss %%xmm2, %2\n\t" \
02485 "movss %%xmm3, %3\n\t" \
02486 "movss %%xmm4, %4\n\t"
02487 : "=m" (fpX[0]),
02488 "=m" (fpX[1]),
02489 "=m" (fpX[2]),
02490 "=m" (fpY[0]),
02491 "=m" (fpY[1])
02492 :
02493 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02494 }
02495
02496
02497 void dsp_x86_sse_iir_nip (double *dpDest, const double *dpSrc, int iDataLength,
02498 const double *dpCoeff, double *dpX, double *dpY)
02499 {
02500 int iDataCntr;
02501
02502 X86_ASM (
02503 "movsd %0, %%xmm1\n\t" \
02504 "movsd %1, %%xmm2\n\t" \
02505 "movsd %2, %%xmm3\n\t" \
02506 "movsd %3, %%xmm4\n\t" \
02507 "prefetchnta %4\n\t" \
02508 "prefetchnta %5\n\t"
02509 :
02510 : "m" (dpX[1]),
02511 "m" (dpX[2]),
02512 "m" (dpY[0]),
02513 "m" (dpY[1]),
02514 "m" (dpCoeff[0]),
02515 "m" (dpCoeff[3])
02516 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02517 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02518 {
02519 X86_ASM (
02520 "movsd %%xmm1, %%xmm0\n\t" \
02521 "movsd %%xmm2, %%xmm1\n\t" \
02522 "movsd %1, %%xmm2\n\t" \
02523 \
02524 "movsd %2, %%xmm5\n\t" \
02525 "mulsd %%xmm2, %%xmm5\n\t" \
02526 "movsd %3, %%xmm6\n\t" \
02527 "mulsd %%xmm1, %%xmm6\n\t" \
02528 "addsd %%xmm6, %%xmm5\n\t" \
02529 "movsd %4, %%xmm6\n\t" \
02530 "mulsd %%xmm0, %%xmm6\n\t" \
02531 "addsd %%xmm6, %%xmm5\n\t" \
02532 \
02533 "movsd %5, %%xmm6\n\t" \
02534 "mulsd %%xmm4, %%xmm6\n\t" \
02535 "movsd %6, %%xmm7\n\t" \
02536 "mulsd %%xmm3, %%xmm7\n\t" \
02537 "addsd %%xmm7, %%xmm6\n\t" \
02538 \
02539 "addsd %%xmm5, %%xmm6\n\t" \
02540 "movsd %%xmm4, %%xmm3\n\t" \
02541 "movsd %%xmm6, %%xmm4\n\t" \
02542 \
02543 "movsd %%xmm6, %0\n\t"
02544 : "=m" (dpDest[iDataCntr])
02545 : "m" (dpSrc[iDataCntr]),
02546 "m" (dpCoeff[0]),
02547 "m" (dpCoeff[1]),
02548 "m" (dpCoeff[2]),
02549 "m" (dpCoeff[3]),
02550 "m" (dpCoeff[4])
02551 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02552 "memory");
02553 }
02554 X86_ASM (
02555 "movsd %%xmm0, %0\n\t" \
02556 "movsd %%xmm1, %1\n\t" \
02557 "movsd %%xmm2, %2\n\t" \
02558 "movsd %%xmm3, %3\n\t" \
02559 "movsd %%xmm4, %4\n\t"
02560 : "=m" (dpX[0]),
02561 "=m" (dpX[1]),
02562 "=m" (dpX[2]),
02563 "=m" (dpY[0]),
02564 "=m" (dpY[1])
02565 :
02566 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02567 }
02568
02569
02570 #ifdef __cplusplus
02571 }
02572 #endif
02573
02574 #endif