00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifdef DSP_X86
00024
00025
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <limits.h>
00029 #include <math.h>
00030 #include <float.h>
00031
00032 #include "dsp/X86.h"
00033
00034
/* File-local buffer for the CPUID vendor string: dsp_x86_cpuid() stores three
   32-bit register values (12 bytes) here and sets element 12 to NUL. */
00035 static char cpCPUid[13];
00036
00037
00038 #ifdef __cplusplus
00039 extern "C"
00040 {
00041 #endif
00042
00043
00044 const char *dsp_x86_cpuid ()
00045 {
00046 unsigned int *ipCPUid = (unsigned int *) cpCPUid;
00047
00048 X86_ASM (
00049 "pushl %%eax\n\t" \
00050 "pushl %%ebx\n\t" \
00051 "pushl %%ecx\n\t" \
00052 "pushl %%edx\n\t" \
00053 "xorl %%eax, %%eax\n\t" \
00054 "cpuid\n\t" \
00055 "movl %%ebx, %0\n\t" \
00056 "movl %%ecx, %2\n\t" \
00057 "movl %%edx, %1\n\t" \
00058 "popl %%edx\n\t" \
00059 "popl %%ecx\n\t" \
00060 "popl %%ebx\n\t" \
00061 "popl %%eax\n\t"
00062 : "=m" (ipCPUid[0]),
00063 "=m" (ipCPUid[1]),
00064 "=m" (ipCPUid[2])
00065 :
00066 : "eax", "ebx", "ecx", "edx", "memory");
00067 cpCPUid[12] = '\0';
00068
00069 return cpCPUid;
00070 }
00071
00072
00073 unsigned int dsp_x86_features ()
00074 {
00075 unsigned int uiFeatures = 0;
00076
00077 X86_ASM (
00078 "pushl %%eax\n\t" \
00079 "pushl %%ebx\n\t" \
00080 "pushl %%ecx\n\t" \
00081 "pushl %%edx\n\t" \
00082 "movl $1, %%eax\n\t" \
00083 "cpuid\n\t" \
00084 "movl %%edx, %0\n\t" \
00085 "popl %%edx\n\t" \
00086 "popl %%ecx\n\t" \
00087 "popl %%ebx\n\t" \
00088 "popl %%eax\n\t"
00089 : "=m" (uiFeatures)
00090 :
00091 : "eax", "ebx", "ecx", "edx", "memory");
00092
00093 return uiFeatures;
00094 }
00095
00096
00097 unsigned int dsp_x86_amd_features ()
00098 {
00099 unsigned int uiFunction = 0x80000001;
00100 unsigned int uiFeatures = 0;
00101
00102 X86_ASM (
00103 "pushl %%eax\n\t" \
00104 "pushl %%ebx\n\t" \
00105 "pushl %%ecx\n\t" \
00106 "pushl %%edx\n\t" \
00107 "movl %1, %%eax\n\t" \
00108 "cpuid\n\t" \
00109 "movl %%edx, %0\n\t" \
00110 "popl %%edx\n\t" \
00111 "popl %%ecx\n\t" \
00112 "popl %%ebx\n\t" \
00113 "popl %%eax\n\t"
00114 : "=m" (uiFeatures)
00115 : "m" (uiFunction)
00116 : "eax", "ebx", "ecx", "edx", "memory");
00117
00118 return uiFeatures;
00119 }
00120
00121
/*
 * Return 1 when the CPU vendor string is "AuthenticAMD" and both bit 31 and
 * bit 30 of the extended feature word (dsp_x86_amd_features()) are set,
 * otherwise 0.
 */
extern int dsp_x86_have_e3dnow ()
{
    /* Unsigned constants: shifting a signed 1 into bit 31 is undefined. */
    const unsigned int uiRequired = (1U << 31) | (1U << 30);

    if (strcmp(dsp_x86_cpuid(), "AuthenticAMD") != 0)
        return 0;

    return ((dsp_x86_amd_features() & uiRequired) == uiRequired) ? 1 : 0;
}
00134
00135
/*
 * Return 1 when both bit 25 and bit 26 of the CPUID leaf-1 feature word
 * (dsp_x86_features()) are set, otherwise 0.
 */
extern int dsp_x86_have_sse2 ()
{
    const unsigned int uiEdx = dsp_x86_features();
    const int iBit25 = ((uiEdx & (1 << 25)) != 0);
    const int iBit26 = ((uiEdx & (1 << 26)) != 0);

    return (iBit25 && iBit26) ? 1 : 0;
}
00145
00146
00147 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
00148 {
00149 int iStartIdx;
00150 int iDataCntr;
00151 int iDataCount;
00152 stpm64 m64pDest = (stpm64) fpDest;
00153 stpm64 m64pSrc = (stpm64) fpSrc;
00154
00155 iStartIdx = 0;
00156
00157 X86_ASM (
00158 "prefetchnta %0\n\t" \
00159 "prefetchnta %1\n\t" \
00160 "prefetchnta %2\n\t" \
00161 "prefetchnta %3\n\t"
00162 :
00163 : "m" (m64pSrc[0]),
00164 "m" (m64pSrc[8]),
00165 "m" (m64pSrc[16]),
00166 "m" (m64pSrc[24]));
00167
00168 iDataCount = ((iDataLength & 0xfffffff0) >> 1);
00169 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00170 {
00171
00172 X86_ASM (
00173 "prefetchnta %16\n\t" \
00174 "movq %8, %%mm0\n\t" \
00175 "movq %9, %%mm1\n\t" \
00176 "movq %10, %%mm2\n\t" \
00177 "movq %11, %%mm3\n\t" \
00178 "movq %12, %%mm4\n\t" \
00179 "movq %13, %%mm5\n\t" \
00180 "movq %14, %%mm6\n\t" \
00181 "movq %15, %%mm7\n\t" \
00182 "movntq %%mm0, %0\n\t" \
00183 "movntq %%mm1, %1\n\t" \
00184 "movntq %%mm2, %2\n\t" \
00185 "movntq %%mm3, %3\n\t" \
00186 "movntq %%mm4, %4\n\t" \
00187 "movntq %%mm5, %5\n\t" \
00188 "movntq %%mm6, %6\n\t" \
00189 "movntq %%mm7, %7\n\t"
00190 : "=m" (m64pDest[iDataCntr]),
00191 "=m" (m64pDest[iDataCntr + 1]),
00192 "=m" (m64pDest[iDataCntr + 2]),
00193 "=m" (m64pDest[iDataCntr + 3]),
00194 "=m" (m64pDest[iDataCntr + 4]),
00195 "=m" (m64pDest[iDataCntr + 5]),
00196 "=m" (m64pDest[iDataCntr + 6]),
00197 "=m" (m64pDest[iDataCntr + 7])
00198 : "m" (m64pSrc[iDataCntr]),
00199 "m" (m64pSrc[iDataCntr + 1]),
00200 "m" (m64pSrc[iDataCntr + 2]),
00201 "m" (m64pSrc[iDataCntr + 3]),
00202 "m" (m64pSrc[iDataCntr + 4]),
00203 "m" (m64pSrc[iDataCntr + 5]),
00204 "m" (m64pSrc[iDataCntr + 6]),
00205 "m" (m64pSrc[iDataCntr + 7]),
00206 "m" (m64pSrc[iDataCntr + 32])
00207 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245 }
00246 iStartIdx = iDataCount;
00247 iDataCount = ((iDataLength & 0xfffffffe) >> 1);
00248 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00249 {
00250
00251 X86_ASM (
00252 "prefetchnta %2\n\t" \
00253 "movq %1, %%mm0\n\t" \
00254 "movntq %%mm0, %0\n\t"
00255 : "=m" (m64pDest[iDataCntr])
00256 : "m" (m64pSrc[iDataCntr]),
00257 "m" (m64pSrc[iDataCntr + 32])
00258 : "mm0", "memory");
00259
00260
00261
00262
00263
00264
00265
00266
00267 }
00268 if (iDataLength & 0x1)
00269 {
00270 X86_ASM (
00271 "movd %1, %%mm0\n\t" \
00272 "movd %%mm0, %0\n\t"
00273 : "=m" (fpDest[iDataLength - 1])
00274 : "m" (fpSrc[iDataLength - 1])
00275 : "mm0", "memory");
00276 }
00277 X86_ASM (
00278 "femms\n\t" \
00279 "sfence\n\t");
00280 }
00281
00282
00283 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
00284 {
00285 int iStartIdx;
00286 int iDataCntr;
00287 int iDataCount;
00288
00289 iStartIdx = 0;
00290
00291 X86_ASM (
00292 "prefetchnta %0\n\t" \
00293 "prefetchnta %1\n\t" \
00294 "prefetchnta %2\n\t" \
00295 "prefetchnta %3\n\t"
00296 :
00297 : "m" (dpSrc[0]),
00298 "m" (dpSrc[8]),
00299 "m" (dpSrc[16]),
00300 "m" (dpSrc[24]));
00301
00302 iDataCount = (iDataLength & 0xfffffff8);
00303 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00304 {
00305
00306 X86_ASM (
00307 "prefetchnta %16\n\t" \
00308 "movq %8, %%mm0\n\t" \
00309 "movq %9, %%mm1\n\t" \
00310 "movq %10, %%mm2\n\t" \
00311 "movq %11, %%mm3\n\t" \
00312 "movq %12, %%mm4\n\t" \
00313 "movq %13, %%mm5\n\t" \
00314 "movq %14, %%mm6\n\t" \
00315 "movq %15, %%mm7\n\t" \
00316 "movntq %%mm0, %0\n\t" \
00317 "movntq %%mm1, %1\n\t" \
00318 "movntq %%mm2, %2\n\t" \
00319 "movntq %%mm3, %3\n\t" \
00320 "movntq %%mm4, %4\n\t" \
00321 "movntq %%mm5, %5\n\t" \
00322 "movntq %%mm6, %6\n\t" \
00323 "movntq %%mm7, %7\n\t"
00324 : "=m" (dpDest[iDataCntr]),
00325 "=m" (dpDest[iDataCntr + 1]),
00326 "=m" (dpDest[iDataCntr + 2]),
00327 "=m" (dpDest[iDataCntr + 3]),
00328 "=m" (dpDest[iDataCntr + 4]),
00329 "=m" (dpDest[iDataCntr + 5]),
00330 "=m" (dpDest[iDataCntr + 6]),
00331 "=m" (dpDest[iDataCntr + 7])
00332 : "m" (dpSrc[iDataCntr]),
00333 "m" (dpSrc[iDataCntr + 1]),
00334 "m" (dpSrc[iDataCntr + 2]),
00335 "m" (dpSrc[iDataCntr + 3]),
00336 "m" (dpSrc[iDataCntr + 4]),
00337 "m" (dpSrc[iDataCntr + 5]),
00338 "m" (dpSrc[iDataCntr + 6]),
00339 "m" (dpSrc[iDataCntr + 7]),
00340 "m" (dpSrc[iDataCntr + 32])
00341 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379 }
00380 iStartIdx = iDataCount;
00381 iDataCount = iDataLength;
00382 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00383 {
00384
00385 X86_ASM (
00386 "prefetchnta %2\n\t" \
00387 "movq %1, %%mm0\n\t" \
00388 "movntq %%mm0, %0\n\t"
00389 : "=m" (dpDest[iDataCntr])
00390 : "m" (dpSrc[iDataCntr]),
00391 "m" (dpSrc[iDataCntr + 32])
00392 : "mm0", "memory");
00393
00394
00395
00396
00397
00398
00399
00400
00401 }
00402 X86_ASM (
00403 "femms\n\t" \
00404 "sfence\n\t");
00405 }
00406
00407
00408 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
00409 {
00410 int iDataCntr;
00411 int iDataCount;
00412 stpm64 m64pVect = (stpm64) fpVect;
00413 stm64 m64Src;
00414
00415 m64Src.f[0] = m64Src.f[1] = fSrc;
00416 iDataCount = (iDataLength >> 1);
00417 X86_ASM (
00418 "movq %0, %%mm1\n\t"
00419 :
00420 : "m" (m64Src)
00421 : "mm1", "memory");
00422 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00423 {
00424 X86_ASM (
00425 "movq %1, %%mm0\n\t" \
00426 "pfadd %%mm1, %%mm0\n\t" \
00427 "movntq %%mm0, %0\n\t"
00428 : "=m" (m64pVect[iDataCntr])
00429 : "0" (m64pVect[iDataCntr])
00430 : "mm0", "mm1", "memory");
00431 }
00432 if (iDataLength & 0x1)
00433 {
00434 X86_ASM (
00435 "movd %1, %%mm0\n\t" \
00436 "pfadd %%mm1, %%mm0\n\t" \
00437 "movd %%mm0, %0\n\t"
00438 : "=m" (fpVect[iDataLength - 1])
00439 : "0" (fpVect[iDataLength - 1])
00440 : "mm0", "mm1", "memory");
00441 }
00442 X86_ASM (
00443 "femms\n\t" \
00444 "sfence\n\t");
00445 }
00446
00447
00448 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
00449 {
00450 int iDataCntr;
00451 int iDataCount;
00452 stpm64 m64pVect = (stpm64) fpVect;
00453 stm64 m64Src;
00454
00455 m64Src.f[0] = m64Src.f[1] = fSrc;
00456 iDataCount = (iDataLength >> 1);
00457 X86_ASM (
00458 "movq %0, %%mm1\n\t"
00459 :
00460 : "m" (m64Src)
00461 : "mm1", "memory");
00462 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00463 {
00464 X86_ASM (
00465 "movq %1, %%mm0\n\t" \
00466 "pfmul %%mm1, %%mm0\n\t" \
00467 "movntq %%mm0, %0\n\t"
00468 : "=m" (m64pVect[iDataCntr])
00469 : "0" (m64pVect[iDataCntr])
00470 : "mm0", "mm1", "memory");
00471 }
00472 if (iDataLength & 0x1)
00473 {
00474 X86_ASM (
00475 "movd %1, %%mm0\n\t" \
00476 "pfmul %%mm1, %%mm0\n\t" \
00477 "movd %%mm0, %0\n\t"
00478 : "=m" (fpVect[iDataLength - 1])
00479 : "0" (fpVect[iDataLength - 1])
00480 : "mm0", "mm1", "memory");
00481 }
00482 X86_ASM (
00483 "femms\n\t" \
00484 "sfence\n\t");
00485 }
00486
00487
00488 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00489 int iDataLength)
00490 {
00491 int iDataCntr;
00492 int iDataCount;
00493 stpm64 m64pDest = (stpm64) fpDest;
00494 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00495 stm64 m64Src2;
00496
00497 m64Src2.f[0] = m64Src2.f[1] = fSrc2;
00498 iDataCount = (iDataLength >> 1);
00499 X86_ASM (
00500 "movq %0, %%mm1\n\t"
00501 :
00502 : "m" (m64Src2)
00503 : "mm1", "memory");
00504 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00505 {
00506 X86_ASM (
00507 "movq %1, %%mm0\n\t" \
00508 "pfmul %%mm1, %%mm0\n\t" \
00509 "movntq %%mm0, %0\n\t"
00510 : "=m" (m64pDest[iDataCntr])
00511 : "m" (m64pSrc1[iDataCntr])
00512 : "mm0", "mm1", "memory");
00513 }
00514 if (iDataLength & 0x1)
00515 {
00516 X86_ASM (
00517 "movd %1, %%mm0\n\t" \
00518 "pfmul %%mm1, %%mm0\n\t" \
00519 "movd %%mm0, %0\n\t"
00520 : "=m" (fpDest[iDataLength - 1])
00521 : "m" (fpSrc1[iDataLength - 1])
00522 : "mm0", "mm1", "memory");
00523 }
00524 X86_ASM (
00525 "femms\n\t" \
00526 "sfence\n\t");
00527 }
00528
00529
00530 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00531 {
00532 int iDataCntr;
00533 int iDataCount;
00534 stpm64 m64pDest = (stpm64) fpDest;
00535 stpm64 m64pSrc = (stpm64) fpSrc;
00536
00537 iDataCount = (iDataLength >> 1);
00538 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00539 {
00540 X86_ASM (
00541 "movq %1, %%mm0\n\t" \
00542 "movq %2, %%mm1\n\t" \
00543 "pfadd %%mm1, %%mm0\n\t" \
00544 "movntq %%mm0, %0\n\t"
00545 : "=m" (m64pDest[iDataCntr])
00546 : "0" (m64pDest[iDataCntr]),
00547 "m" (m64pSrc[iDataCntr])
00548 : "mm0", "mm1", "memory");
00549 }
00550 if (iDataLength & 0x1)
00551 {
00552 X86_ASM (
00553 "movd %1, %%mm0\n\t" \
00554 "movd %2, %%mm1\n\t" \
00555 "pfadd %%mm1, %%mm0\n\t" \
00556 "movd %%mm0, %0\n\t"
00557 : "=m" (fpDest[iDataLength - 1])
00558 : "0" (fpDest[iDataLength - 1]),
00559 "m" (fpSrc[iDataLength - 1])
00560 : "mm0", "mm1", "memory");
00561 }
00562 X86_ASM (
00563 "femms\n\t" \
00564 "sfence\n\t");
00565 }
00566
00567
00568 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00569 {
00570 int iDataCntr;
00571 int iDataCount;
00572 stpm64 m64pDest = (stpm64) fpDest;
00573 stpm64 m64pSrc = (stpm64) fpSrc;
00574
00575 iDataCount = (iDataLength >> 1);
00576 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00577 {
00578 X86_ASM (
00579 "movq %1, %%mm0\n\t" \
00580 "movq %2, %%mm1\n\t" \
00581 "pfmul %%mm1, %%mm0\n\t" \
00582 "movntq %%mm0, %0\n\t"
00583 : "=m" (m64pDest[iDataCntr])
00584 : "0" (m64pDest[iDataCntr]),
00585 "m" (m64pSrc[iDataCntr])
00586 : "mm0", "mm1", "memory");
00587 }
00588 if (iDataLength & 0x1)
00589 {
00590 X86_ASM (
00591 "movd %1, %%mm0\n\t" \
00592 "movd %2, %%mm1\n\t" \
00593 "pfmul %%mm1, %%mm0\n\t" \
00594 "movd %%mm0, %0\n\t"
00595 : "=m" (fpDest[iDataLength - 1])
00596 : "0" (fpDest[iDataLength - 1]),
00597 "m" (fpSrc[iDataLength - 1])
00598 : "mm0", "mm1", "memory");
00599 }
00600 X86_ASM (
00601 "femms\n\t" \
00602 "sfence\n\t");
00603 }
00604
00605
00606 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1,
00607 const float *fpSrc2, int iDataLength)
00608 {
00609 int iDataCntr;
00610 int iDataCount;
00611 stpm64 m64pDest = (stpm64) fpDest;
00612 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00613 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00614
00615 iDataCount = (iDataLength >> 1);
00616 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00617 {
00618 X86_ASM (
00619 "movq %1, %%mm0\n\t" \
00620 "movq %2, %%mm1\n\t" \
00621 "pfadd %%mm1, %%mm0\n\t" \
00622 "movntq %%mm0, %0\n\t"
00623 : "=m" (m64pDest[iDataCntr])
00624 : "m" (m64pSrc1[iDataCntr]),
00625 "m" (m64pSrc2[iDataCntr])
00626 : "mm0", "mm1", "memory");
00627 }
00628 if (iDataLength & 0x1)
00629 {
00630 X86_ASM (
00631 "movd %1, %%mm0\n\t" \
00632 "movd %2, %%mm1\n\t" \
00633 "pfadd %%mm1, %%mm0\n\t" \
00634 "movd %%mm0, %0\n\t"
00635 : "=m" (fpDest[iDataLength - 1])
00636 : "m" (fpSrc1[iDataLength - 1]),
00637 "m" (fpSrc2[iDataLength - 1])
00638 : "mm0", "mm1", "memory");
00639 }
00640 X86_ASM (
00641 "femms\n\t" \
00642 "sfence\n\t");
00643 }
00644
00645
00646 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1,
00647 const float *fpSrc2, int iDataLength)
00648 {
00649 int iDataCntr;
00650 int iDataCount;
00651 stpm64 m64pDest = (stpm64) fpDest;
00652 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00653 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00654
00655 iDataCount = (iDataLength >> 1);
00656 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00657 {
00658 X86_ASM (
00659 "movq %1, %%mm0\n\t" \
00660 "movq %2, %%mm1\n\t" \
00661 "pfmul %%mm1, %%mm0\n\t" \
00662 "movntq %%mm0, %0\n\t"
00663 : "=m" (m64pDest[iDataCntr])
00664 : "m" (m64pSrc1[iDataCntr]),
00665 "m" (m64pSrc2[iDataCntr])
00666 : "mm0", "mm1", "memory");
00667 }
00668 if (iDataLength & 0x1)
00669 {
00670 X86_ASM (
00671 "movd %1, %%mm0\n\t" \
00672 "movd %2, %%mm1\n\t" \
00673 "pfmul %%mm1, %%mm0\n\t" \
00674 "movd %%mm0, %0\n\t"
00675 : "=m" (fpDest[iDataLength - 1])
00676 : "m" (fpSrc1[iDataLength - 1]),
00677 "m" (fpSrc2[iDataLength - 1])
00678 : "mm0", "mm1", "memory");
00679 }
00680 X86_ASM (
00681 "femms\n\t" \
00682 "sfence\n\t");
00683 }
00684
00685
00686 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00687 {
00688 int iDataCntr;
00689 stpm64 m64pDest = (stpm64) fpDest;
00690
00691 X86_ASM (
00692 "movq %0, %%mm3\n\t"
00693 :
00694 : "m" (fpSrc)
00695 : "mm3", "memory");
00696 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00697 {
00698 X86_ASM (
00699 "movq %1, %%mm0\n\t" \
00700 "movq %%mm3, %%mm1\n\t" \
00701 "pswapd %%mm1, %%mm2\n\t" \
00702 "pfmul %%mm0, %%mm1\n\t" \
00703 "pfmul %%mm0, %%mm2\n\t" \
00704 "pfpnacc %%mm2, %%mm1\n\t"
00705 "movntq %%mm1, %0\n\t"
00706 : "=m" (m64pDest[iDataCntr])
00707 : "0" (m64pDest[iDataCntr])
00708 : "mm0", "mm1", "mm2", "mm3", "memory");
00709 }
00710 X86_ASM (
00711 "femms\n\t" \
00712 "sfence\n\t");
00713 }
00714
00715
00716 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
00717 {
00718 int iDataCntr;
00719 stpm64 m64pDest = (stpm64) fpDest;
00720 stpm64 m64pSrc = (stpm64) fpSrc;
00721
00722 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00723 {
00724 X86_ASM (
00725 "movq %1, %%mm0\n\t" \
00726 "movq %2, %%mm1\n\t" \
00727 "pswapd %%mm1, %%mm2\n\t" \
00728 "pfmul %%mm0, %%mm1\n\t" \
00729 "pfmul %%mm0, %%mm2\n\t" \
00730 "pfpnacc %%mm2, %%mm1\n\t"
00731 "movntq %%mm1, %0\n\t"
00732 : "=m" (m64pDest[iDataCntr])
00733 : "0" (m64pDest[iDataCntr]),
00734 "m" (m64pSrc[iDataCntr])
00735 : "mm0", "mm1", "mm2", "memory");
00736 }
00737 X86_ASM (
00738 "femms\n\t" \
00739 "sfence\n\t");
00740 }
00741
00742
00743 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1,
00744 const float *fpSrc2, int iDataLength)
00745 {
00746 int iDataCntr;
00747 stpm64 m64pDest = (stpm64) fpDest;
00748 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00749 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00750
00751 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00752 {
00753 X86_ASM (
00754 "movq %1, %%mm0\n\t" \
00755 "movq %2, %%mm1\n\t" \
00756 "pswapd %%mm1, %%mm2\n\t" \
00757 "pfmul %%mm0, %%mm1\n\t" \
00758 "pfmul %%mm0, %%mm2\n\t" \
00759 "pfpnacc %%mm2, %%mm1\n\t"
00760 "movntq %%mm1, %0\n\t"
00761 : "=m" (m64pDest[iDataCntr])
00762 : "m" (m64pSrc1[iDataCntr]),
00763 "m" (m64pSrc2[iDataCntr])
00764 : "mm0", "mm1", "mm2", "memory");
00765 }
00766 X86_ASM (
00767 "femms\n\t" \
00768 "sfence\n\t");
00769 }
00770
00771
00772 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
00773 {
00774 int iDataCntr;
00775 int iDataCount;
00776 stpm64 m64pVect = (stpm64) fpVect;
00777 stm64 m64Mul;
00778 stm64 m64Add;
00779
00780 m64Mul.f[0] = m64Mul.f[1] = fMul;
00781 m64Add.f[0] = m64Add.f[1] = fAdd;
00782 iDataCount = (iDataLength >> 1);
00783 X86_ASM (
00784 "movq %0, %%mm1\n\t" \
00785 "movq %1, %%mm2\n\t"
00786 :
00787 : "m" (m64Mul),
00788 "m" (m64Add)
00789 : "mm1", "mm2", "memory");
00790
00791
00792
00793
00794
00795
00796
00797
00798
00799
00800
00801 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00802 {
00803 X86_ASM (
00804 "movq %1, %%mm0\n\t" \
00805 "pfmul %%mm1, %%mm0\n\t" \
00806 "pfadd %%mm2, %%mm0\n\t" \
00807 "movntq %%mm0, %0\n\t"
00808 : "=m" (m64pVect[iDataCntr])
00809 : "0" (m64pVect[iDataCntr])
00810 : "mm0", "mm1", "mm2", "memory");
00811 }
00812 if (iDataLength & 0x1)
00813 {
00814 X86_ASM (
00815 "movd %1, %%mm0\n\t" \
00816 "pfmul %%mm1, %%mm0\n\t" \
00817 "pfadd %%mm2, %%mm0\n\t" \
00818 "movd %%mm0, %0\n\t"
00819 : "=m" (fpVect[iDataLength - 1])
00820 : "0" (fpVect[iDataLength - 1])
00821 : "mm0", "mm1", "mm2", "memory");
00822 }
00823 X86_ASM (
00824 "femms\n\t" \
00825 "sfence\n\t");
00826 }
00827
00828
00829 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
00830 float fMul, float fAdd, int iDataLength)
00831 {
00832 int iDataCntr;
00833 int iDataCount;
00834 stpm64 m64pDest = (stpm64) fpDest;
00835 stpm64 m64pSrc = (stpm64) fpSrc;
00836 stm64 m64Mul;
00837 stm64 m64Add;
00838
00839 m64Mul.f[0] = m64Mul.f[1] = fMul;
00840 m64Add.f[0] = m64Add.f[1] = fAdd;
00841 iDataCount = (iDataLength >> 1);
00842 X86_ASM (
00843 "movq %0, %%mm1\n\t" \
00844 "movq %1, %%mm2\n\t"
00845 :
00846 : "m" (m64Mul),
00847 "m" (m64Add)
00848 : "mm1", "mm2", "memory");
00849 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00850 {
00851 X86_ASM (
00852 "movq %1, %%mm0\n\t" \
00853 "pfmul %%mm1, %%mm0\n\t" \
00854 "pfadd %%mm2, %%mm0\n\t" \
00855 "movntq %%mm0, %0\n\t"
00856 : "=m" (m64pDest[iDataCntr])
00857 : "m" (m64pSrc[iDataCntr])
00858 : "mm0", "mm1", "mm2", "memory");
00859 }
00860 if (iDataLength & 0x1)
00861 {
00862 X86_ASM (
00863 "movd %1, %%mm0\n\t" \
00864 "pfmul %%mm1, %%mm0\n\t" \
00865 "pfadd %%mm2, %%mm0\n\t" \
00866 "movd %%mm0, %0\n\t"
00867 : "=m" (fpDest[iDataLength - 1])
00868 : "m" (fpSrc[iDataLength - 1])
00869 : "mm0", "mm1", "mm2", "memory");
00870 }
00871 X86_ASM (
00872 "femms\n\t" \
00873 "sfence\n\t");
00874 }
00875
00876
00877 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
00878 {
00879 int iDataCntr;
00880 int iDataCount;
00881 stpm64 m64pVect = (stpm64) fpVect;
00882 stm64 m64Add;
00883 stm64 m64Mul;
00884
00885 m64Add.f[0] = m64Add.f[1] = fAdd;
00886 m64Mul.f[0] = m64Mul.f[1] = fMul;
00887 iDataCount = (iDataLength >> 1);
00888 X86_ASM (
00889 "movq %0, %%mm1\n\t" \
00890 "movq %1, %%mm2\n\t"
00891 :
00892 : "m" (m64Add),
00893 "m" (m64Mul)
00894 : "mm1", "mm2", "memory");
00895 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00896 {
00897 X86_ASM (
00898 "movq %1, %%mm0\n\t" \
00899 "pfadd %%mm1, %%mm0\n\t" \
00900 "pfmul %%mm2, %%mm0\n\t" \
00901 "movntq %%mm0, %0\n\t"
00902 : "=m" (m64pVect[iDataCntr])
00903 : "0" (m64pVect[iDataCntr])
00904 : "mm0", "mm1", "mm2", "memory");
00905 }
00906 if (iDataLength & 0x1)
00907 {
00908 X86_ASM (
00909 "movd %1, %%mm0\n\t" \
00910 "pfadd %%mm1, %%mm0\n\t" \
00911 "pfmul %%mm2, %%mm0\n\t" \
00912 "movd %%mm0, %0\n\t"
00913 : "=m" (fpVect[iDataLength - 1])
00914 : "0" (fpVect[iDataLength - 1])
00915 : "mm0", "mm1", "mm2", "memory");
00916 }
00917 X86_ASM (
00918 "femms\n\t" \
00919 "sfence\n\t");
00920 }
00921
00922
00923 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2,
00924 int iDataLength)
00925 {
00926 int iDataCntr;
00927 int iDataCount;
00928 float fRes;
00929 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00930 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00931
00932 iDataCount = (iDataLength >> 1);
00933 X86_ASM (
00934 "pxor %%mm0, %%mm0\n\t"
00935 :
00936 :
00937 : "mm0");
00938 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00939 {
00940 X86_ASM (
00941 "movq %0, %%mm1\n\t" \
00942 "movq %1, %%mm2\n\t" \
00943 "pfmul %%mm2, %%mm1\n\t" \
00944 "pfacc %%mm1, %%mm0\n\t"
00945 :
00946 : "m" (m64pSrc1[iDataCntr]),
00947 "m" (m64pSrc2[iDataCntr])
00948 : "mm0", "mm1", "mm2", "memory");
00949 }
00950 if (iDataLength & 0x1)
00951 {
00952 X86_ASM (
00953 "movd %0, %%mm1\n\t" \
00954 "movd %1, %%mm2\n\t" \
00955 "pfmul %%mm2, %%mm1\n\t" \
00956 "pfacc %%mm1, %%mm0\n\t"
00957 :
00958 : "m" (fpSrc1[iDataLength - 1]),
00959 "m" (fpSrc2[iDataLength - 1])
00960 : "mm0", "mm1", "mm2", "memory");
00961 }
00962 X86_ASM (
00963 "pfacc %%mm0, %%mm0\n\t" \
00964 "movd %%mm0, %0\n\t"
00965 : "=m" (fRes)
00966 :
00967 : "mm0", "memory");
00968 X86_ASM ("femms\n\t");
00969
00970 return fRes;
00971 }
00972
00973
00974 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
00975 int iDataLength)
00976 {
00977 int iDataCntr;
00978 int iDataCount;
00979 stm64 m64Min;
00980 stm64 m64Max;
00981 stpm64 m64pSrc = (stpm64) fpSrc;
00982
00983 m64Min.f[0] = m64Min.f[1] = FLT_MAX;
00984 m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
00985 iDataCount = (iDataLength >> 1);
00986 X86_ASM (
00987 "movq %0, %%mm1\n\t" \
00988 "movq %1, %%mm2\n\t"
00989 :
00990 : "m" (m64Min),
00991 "m" (m64Max)
00992 : "mm1", "mm2", "memory");
00993 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00994 {
00995 X86_ASM (
00996 "movq %0, %%mm0\n\t" \
00997 "pfmin %%mm0, %%mm1\n\t" \
00998 "pfmax %%mm0, %%mm2\n\t"
00999 :
01000 : "m" (m64pSrc[iDataCntr])
01001 : "mm0", "mm1", "mm2", "memory");
01002 }
01003 if (iDataLength & 0x1)
01004 {
01005 X86_ASM (
01006 "movd %0, %%mm0\n\t" \
01007 "pfmin %%mm0, %%mm1\n\t" \
01008 "pfmax %%mm0, %%mm2\n\t"
01009 :
01010 : "m" (fpSrc[iDataLength - 1])
01011 : "mm0", "mm1", "mm2", "memory");
01012 }
01013 X86_ASM (
01014 "pswapd %%mm1, %%mm3\n\t" \
01015 "pfmin %%mm3, %%mm1\n\t" \
01016 "pswapd %%mm2, %%mm3\n\t" \
01017 "pfmax %%mm3, %%mm2\n\t" \
01018 "movd %%mm1, %0\n\t" \
01019 "movd %%mm2, %1\n\t"
01020 : "=m" (*fpMin),
01021 "=m" (*fpMax)
01022 :
01023 : "mm1", "mm2", "mm3", "memory");
01024 X86_ASM ("femms\n\t");
01025 }
01026
01027
01028 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01029 int iDataLength)
01030 {
01031 int iDataCntr;
01032 int iDataCount;
01033 float fRes;
01034 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01035 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01036
01037 iDataCount = (iDataLength >> 1);
01038 X86_ASM (
01039 "pxor %%mm3, %%mm3\n\t" \
01040 "pxor %%mm4, %%mm4\n\t" \
01041 "pxor %%mm5, %%mm5\n\t"
01042 :
01043 :
01044 : "mm3", "mm4", "mm5");
01045 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01046 {
01047 X86_ASM (
01048 "movq %0, %%mm0\n\t" \
01049 "movq %1, %%mm1\n\t" \
01050 "movq %%mm1, %%mm2\n\t" \
01051 "pfmul %%mm0, %%mm2\n\t" \
01052 "pfacc %%mm2, %%mm5\n\t" \
01053 "pfmul %%mm0, %%mm0\n\t" \
01054 "pfacc %%mm0, %%mm3\n\t" \
01055 "pfmul %%mm1, %%mm1\n\t" \
01056 "pfacc %%mm1, %%mm4\n\t"
01057 :
01058 : "m" (m64pSrc1[iDataCntr]),
01059 "m" (m64pSrc2[iDataCntr])
01060 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01061 }
01062 if (iDataLength & 0x1)
01063 {
01064 X86_ASM (
01065 "movd %0, %%mm0\n\t" \
01066 "movd %1, %%mm1\n\t" \
01067 "movq %%mm1, %%mm2\n\t" \
01068 "pfmul %%mm0, %%mm2\n\t" \
01069 "pfacc %%mm2, %%mm5\n\t" \
01070 "pfmul %%mm0, %%mm0\n\t" \
01071 "pfacc %%mm0, %%mm3\n\t" \
01072 "pfmul %%mm1, %%mm1\n\t" \
01073 "pfacc %%mm1, %%mm4\n\t"
01074 :
01075 : "m" (fpSrc1[iDataLength - 1]),
01076 "m" (fpSrc2[iDataLength - 1])
01077 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01078 }
01079 X86_ASM (
01080 "pfacc %%mm3, %%mm3\n\t" \
01081 "pfacc %%mm4, %%mm4\n\t" \
01082 "pfacc %%mm5, %%mm5\n\t" \
01083 \
01084 "movd %1, %%mm6\n\t" \
01085 "pswapd %%mm6, %%mm7\n\t" \
01086 "paddd %%mm7, %%mm6\n\t" \
01087 "pi2fd %%mm6, %%mm7\n\t" \
01088 \
01089 "pfrcp %%mm7, %%mm6\n\t" \
01090 "pfrcpit1 %%mm6, %%mm7\n\t" \
01091 "pfrcpit2 %%mm6, %%mm7\n\t" \
01092 \
01093 "pfmul %%mm3, %%mm4\n\t" \
01094 \
01095 "movq %%mm4, %%mm0\n\t" \
01096 "pfrsqrt %%mm4, %%mm1\n\t" \
01097 "movq %%mm1, %%mm2\n\t" \
01098 "pfmul %%mm1, %%mm1\n\t" \
01099 "pfrsqit1 %%mm4, %%mm1\n\t" \
01100 "pfrcpit2 %%mm2, %%mm1\n\t" \
01101 "pfmul %%mm1, %%mm4\n\t" \
01102 \
01103 "pfmul %%mm6, %%mm4\n\t" \
01104 \
01105 "pfrcp %%mm4, %%mm0\n\t" \
01106 "pfrcpit1 %%mm0, %%mm4\n\t" \
01107 "pfrcpit2 %%mm0, %%mm4\n\t" \
01108 \
01109 "pfmul %%mm6, %%mm5\n\t" \
01110 "pfmul %%mm4, %%mm5\n\t" \
01111 "movd %%mm5, %0\n\t"
01112 : "=m" (fRes)
01113 : "m" (iDataLength)
01114 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01115 X86_ASM ("femms\n\t");
01116
01117 return fRes;
01118 }
01119
01120
01121 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
01122 int iIntMax)
01123 {
01124 int iDataCntr;
01125 float fScale;
01126
01127 X86_ASM (
01128 "movd %1, %%mm1\n\t" \
01129 "pswapd %%mm1, %%mm2\n\t" \
01130 "paddd %%mm2, %%mm1\n\t" \
01131 "pi2fd %%mm1, %%mm1\n\t" \
01132 "pfrcp %%mm1, %%mm2\n\t" \
01133 "pfrcpit1 %%mm2, %%mm1\n\t" \
01134 "pfrcpit2 %%mm2, %%mm1\n\t" \
01135 "movd %%mm1, %0\n\t"
01136 : "=m" (fScale)
01137 : "m" (iIntMax)
01138 : "mm1", "mm2", "memory");
01139 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01140 {
01141 X86_ASM (
01142 "movd %1, %%mm0\n\t" \
01143 "punpcklwd %%mm0, %%mm0\n\t" \
01144 "pi2fw %%mm0, %%mm0\n\t" \
01145 "pfmul %%mm1, %%mm0\n\t" \
01146 "movntq %%mm0, %0\n\t"
01147 : "=m" (fpDest[iDataCntr])
01148 : "m" (ipSrc[iDataCntr])
01149 : "mm0", "mm1", "memory");
01150 }
01151 X86_ASM (
01152 "femms\n\t" \
01153 "sfence\n\t");
01154 if ((iDataLength % 2) != 0)
01155 {
01156 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01157 }
01158 }
01159
01160
01161 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
01162 int iIntMax)
01163 {
01164 int iDataCntr;
01165 float fScale;
01166
01167 X86_ASM (
01168 "movd %1, %%mm1\n\t" \
01169 "pswapd %%mm1, %%mm2\n\t" \
01170 "paddd %%mm2, %%mm1\n\t" \
01171 "pi2fd %%mm1, %%mm1\n\t" \
01172 "pfrcp %%mm1, %%mm2\n\t" \
01173 "pfrcpit1 %%mm2, %%mm1\n\t" \
01174 "pfrcpit2 %%mm2, %%mm1\n\t" \
01175 "movd %%mm1, %0\n\t"
01176 : "=m" (fScale)
01177 : "m" (iIntMax)
01178 : "mm1", "mm2", "memory");
01179 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01180 {
01181 X86_ASM (
01182 "movq %1, %%mm0\n\t" \
01183 "pi2fd %%mm0, %%mm0\n\t" \
01184 "pfmul %%mm1, %%mm0\n\t" \
01185 "movntq %%mm0, %0\n\t"
01186 : "=m" (fpDest[iDataCntr])
01187 : "m" (ipSrc[iDataCntr])
01188 : "mm0", "mm1", "memory");
01189 }
01190 X86_ASM (
01191 "femms\n\t" \
01192 "sfence\n\t");
01193 if ((iDataLength % 2) != 0)
01194 {
01195 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01196 }
01197 }
01198
01199
01200 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength,
01201 const float *fpCoeff, int iCoeffLength)
01202 {
01203 int iSrcCntr;
01204 int iDestCntr;
01205 int iCoeffCntr;
01206 int iSrcCount;
01207 stpm64 m64pDest = (stpm64) fpDest;
01208
01209 iDestCntr = 0;
01210 iSrcCount = iDataLength + iCoeffLength;
01211 for (iSrcCntr = iCoeffLength;
01212 iSrcCntr < iSrcCount;
01213 iSrcCntr += 2)
01214 {
01215 X86_ASM (
01216 "pxor %%mm0, %%mm0\n\t"
01217 :
01218 :
01219 : "mm0");
01220 for (iCoeffCntr = 0;
01221 iCoeffCntr < iCoeffLength;
01222 iCoeffCntr++)
01223 {
01224 X86_ASM (
01225 "movq %0, %%mm1\n\t" \
01226 "movd %1, %%mm2\n\t" \
01227 "pswapd %%mm2, %%mm3\n\t" \
01228 "pfadd %%mm3, %%mm2\n\t" \
01229 "pfmul %%mm2, %%mm1\n\t" \
01230 "pfadd %%mm1, %%mm0\n\t"
01231 :
01232 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
01233 "m" (fpCoeff[iCoeffCntr])
01234 : "mm0", "mm1", "mm2", "mm3", "memory");
01235 }
01236 X86_ASM (
01237 "movntq %%mm0, %0\n\t"
01238 : "=m" (m64pDest[iDestCntr++])
01239 :
01240 : "mm0", "memory");
01241 }
01242 if (iDataLength & 0x1)
01243 {
01244 X86_ASM (
01245 "pxor %%mm0, %%mm0\n\t"
01246 :
01247 :
01248 : "mm0");
01249 for (iCoeffCntr = 0;
01250 iCoeffCntr < iCoeffLength;
01251 iCoeffCntr++)
01252 {
01253 X86_ASM (
01254 "movd %0, %%mm1\n\t" \
01255 "movd %1, %%mm2\n\t" \
01256 "pfmul %%mm2, %%mm1\n\t" \
01257 "pfadd %%mm1, %%mm0\n\t"
01258 :
01259 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
01260 "m" (fpCoeff[iCoeffCntr])
01261 : "mm0", "mm1", "mm2", "memory");
01262 }
01263 X86_ASM (
01264 "movd %%mm0, %0\n\t"
01265 : "=m" (fpDest[iDataLength - 1])
01266 :
01267 : "mm0", "memory");
01268 }
01269 X86_ASM (
01270 "femms\n\t" \
01271 "sfence\n\t");
01272 }
01273
01274
01275 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
01276 float *fpX, float *fpY)
01277 {
01278 int iDataCntr;
01279 stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
01280 stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
01281 stpm64 m64pX = (stpm64) fpX;
01282 stpm64 m64pY = (stpm64) fpY;
01283
01284 X86_ASM (
01285 "movq %0, %%mm0\n\t" \
01286 "pswapd %%mm0, %%mm2\n\t" \
01287 "movd %1, %%mm3\n\t" \
01288 "movq %2, %%mm0\n\t" \
01289 "pswapd %%mm0, %%mm4\n\t" \
01290 "movq %3, %%mm5\n\t" \
01291 "movq %4, %%mm7\n\t" \
01292 :
01293 : "m" (*m64pCoeff),
01294 "m" (fpCoeff[0]),
01295 "m" (*m64pCoeff2),
01296 "m" (*m64pX),
01297 "m" (*m64pY)
01298 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
01299 for (iDataCntr = 0;
01300 iDataCntr < iDataLength;
01301 iDataCntr++)
01302 {
01303 X86_ASM (
01304 "pxor %%mm0, %%mm0\n\t" \
01305 "movd %1, %%mm6\n\t" \
01306 "movq %%mm5, %%mm1\n\t" \
01307 "pfmul %%mm2, %%mm1\n\t" \
01308 "pfacc %%mm1, %%mm0\n\t" \
01309 "movq %%mm6, %%mm1\n\t" \
01310 "pfmul %%mm3, %%mm1\n\t" \
01311 "pfacc %%mm1, %%mm0\n\t" \
01312 "movq %%mm7, %%mm1\n\t" \
01313 "pfmul %%mm4, %%mm1\n\t" \
01314 "pfacc %%mm1, %%mm0\n\t" \
01315 "pfacc %%mm0, %%mm0\n\t" \
01316 \
01317 "pswapd %%mm7, %%mm1\n\t" \
01318 "movq %%mm1, %%mm7\n\t" \
01319 "punpckldq %%mm0, %%mm7\n\t" \
01320 \
01321 "pswapd %%mm5, %%mm1\n\t" \
01322 "movq %%mm1, %%mm5\n\t" \
01323 "movq %%mm6, %%mm1\n\t" \
01324 "punpckldq %%mm1, %%mm5\n\t" \
01325 \
01326 "movd %%mm0, %0\n\t"
01327 : "=m" (fpVect[iDataCntr])
01328 : "0" (fpVect[iDataCntr])
01329 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01330 }
01331 X86_ASM (
01332 "movq %%mm5, %0\n\t" \
01333 "movd %%mm6, %1\n\t" \
01334 "movq %%mm7, %2\n\t"
01335 : "=m" (*m64pX),
01336 "=m" (fpX[2]),
01337 "=m" (*m64pY)
01338 :
01339 : "mm5", "mm6", "mm7", "memory");
01340 X86_ASM ("femms\n\t");
01341 }
01342
01343
01344 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
01345 const float *fpCoeff, float *fpX, float *fpY)
01346 {
01347 int iDataCntr;
01348 stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
01349 stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
01350 stpm64 m64pX = (stpm64) fpX;
01351 stpm64 m64pY = (stpm64) fpY;
01352
01353 X86_ASM (
01354 "movq %0, %%mm0\n\t" \
01355 "pswapd %%mm0, %%mm2\n\t" \
01356 "movd %1, %%mm3\n\t" \
01357 "movq %2, %%mm0\n\t" \
01358 "pswapd %%mm0, %%mm4\n\t" \
01359 "movq %3, %%mm5\n\t" \
01360 "movq %4, %%mm7\n\t" \
01361 :
01362 : "m" (*m64pCoeff),
01363 "m" (fpCoeff[0]),
01364 "m" (*m64pCoeff2),
01365 "m" (*m64pX),
01366 "m" (*m64pY)
01367 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
01368 for (iDataCntr = 0;
01369 iDataCntr < iDataLength;
01370 iDataCntr++)
01371 {
01372 X86_ASM (
01373 "pxor %%mm0, %%mm0\n\t" \
01374 "movd %1, %%mm6\n\t" \
01375 "movq %%mm5, %%mm1\n\t" \
01376 "pfmul %%mm2, %%mm1\n\t" \
01377 "pfacc %%mm1, %%mm0\n\t" \
01378 "movq %%mm6, %%mm1\n\t" \
01379 "pfmul %%mm3, %%mm1\n\t" \
01380 "pfacc %%mm1, %%mm0\n\t" \
01381 "movq %%mm7, %%mm1\n\t" \
01382 "pfmul %%mm4, %%mm1\n\t" \
01383 "pfacc %%mm1, %%mm0\n\t" \
01384 "pfacc %%mm0, %%mm0\n\t" \
01385 \
01386 "pswapd %%mm7, %%mm1\n\t" \
01387 "movq %%mm1, %%mm7\n\t" \
01388 "punpckldq %%mm0, %%mm7\n\t" \
01389 \
01390 "pswapd %%mm5, %%mm1\n\t" \
01391 "movq %%mm1, %%mm5\n\t" \
01392 "movq %%mm6, %%mm1\n\t" \
01393 "punpckldq %%mm1, %%mm5\n\t" \
01394 \
01395 "movd %%mm0, %0\n\t"
01396 : "=m" (fpDest[iDataCntr])
01397 : "m" (fpSrc[iDataCntr])
01398 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01399 }
01400 X86_ASM (
01401 "movq %%mm5, %0\n\t" \
01402 "movd %%mm6, %1\n\t" \
01403 "movq %%mm7, %2\n\t"
01404 : "=m" (*m64pX),
01405 "=m" (fpX[2]),
01406 "=m" (*m64pY)
01407 :
01408 : "mm5", "mm6", "mm7", "memory");
01409 X86_ASM ("femms\n\t");
01410 }
01411
01412
01413 #ifdef __cplusplus
01414 }
01415 #endif
01416
01417 #endif