@@ -39,65 +39,67 @@ size_t count) { OF_ALIGN(16) float tmp[4]; __asm__ __volatile__ ( - "test %0, %0\n\t" + "test %[count], %[count]\n\t" "jz 0f\n" "\n\t" - "movaps (%2), %%xmm0\n\t" - "movaps 16(%2), %%xmm1\n\t" - "movaps 32(%2), %%xmm2\n\t" + "movaps (%[matrix]), %%xmm0\n\t" + "movaps 16(%[matrix]), %%xmm1\n\t" + "movaps 32(%[matrix]), %%xmm2\n\t" # ifdef OF_AMD64 - "movaps 48(%2), %%xmm8\n" + "movaps 48(%[matrix]), %%xmm8\n" # endif "\n\t" "0:\n\t" - "movaps (%1), %%xmm3\n" + "movaps (%[vectors]), %%xmm3\n" "\n\t" "movaps %%xmm0, %%xmm4\n\t" "mulps %%xmm3, %%xmm4\n\t" - "movaps %%xmm4, (%3)\n\t" - "addss 4(%3), %%xmm4\n\t" - "addss 8(%3), %%xmm4\n\t" - "addss 12(%3), %%xmm4\n" + "movaps %%xmm4, (%[tmp])\n\t" + "addss 4(%[tmp]), %%xmm4\n\t" + "addss 8(%[tmp]), %%xmm4\n\t" + "addss 12(%[tmp]), %%xmm4\n" "\n\t" "movaps %%xmm1, %%xmm5\n\t" "mulps %%xmm3, %%xmm5\n\t" - "movaps %%xmm5, (%3)\n\t" - "addss 4(%3), %%xmm5\n\t" - "addss 8(%3), %%xmm5\n\t" - "addss 12(%3), %%xmm5\n" + "movaps %%xmm5, (%[tmp])\n\t" + "addss 4(%[tmp]), %%xmm5\n\t" + "addss 8(%[tmp]), %%xmm5\n\t" + "addss 12(%[tmp]), %%xmm5\n" "\n\t" "movaps %%xmm2, %%xmm6\n\t" "mulps %%xmm3, %%xmm6\n\t" - "movaps %%xmm6, (%3)\n\t" - "addss 4(%3), %%xmm6\n\t" - "addss 8(%3), %%xmm6\n\t" - "addss 12(%3), %%xmm6\n" + "movaps %%xmm6, (%[tmp])\n\t" + "addss 4(%[tmp]), %%xmm6\n\t" + "addss 8(%[tmp]), %%xmm6\n\t" + "addss 12(%[tmp]), %%xmm6\n" "\n\t" # ifdef OF_AMD64 "movaps %%xmm8, %%xmm7\n\t" # else - "movaps 48(%2), %%xmm7\n\t" + "movaps 48(%[matrix]), %%xmm7\n\t" # endif "mulps %%xmm3, %%xmm7\n\t" - "movaps %%xmm7, (%3)\n\t" - "addss 4(%3), %%xmm7\n\t" - "addss 8(%3), %%xmm7\n\t" - "addss 12(%3), %%xmm7\n" - "\n\t" - "movss %%xmm4, (%1)\n\t" - "movss %%xmm5, 4(%1)\n\t" - "movss %%xmm6, 8(%1)\n\t" - "movss %%xmm7, 12(%1)\n" - "\n\t" - "add $16, %1\n\t" - "dec %0\n\t" + "movaps %%xmm7, (%[tmp])\n\t" + "addss 4(%[tmp]), %%xmm7\n\t" + "addss 8(%[tmp]), %%xmm7\n\t" + "addss 12(%[tmp]), %%xmm7\n" + "\n\t" + "movss 
%%xmm4, (%[vectors])\n\t" + "movss %%xmm5, 4(%[vectors])\n\t" + "movss %%xmm6, 8(%[vectors])\n\t" + "movss %%xmm7, 12(%[vectors])\n" + "\n\t" + "add $16, %[vectors]\n\t" + "dec %[count]\n\t" "jnz 0b\n" - : "+r"(count), "+r"(vectors) - : "r"(self->_values), "r"(&tmp) + : [count] "+r" (count), + [vectors] "+r" (vectors) + : [matrix] "r" (self->_values), + [tmp] "r" (&tmp) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", # ifdef OF_AMD64 "xmm8", # endif "memory" @@ -119,55 +121,58 @@ __asm__ __volatile__ ( "movl $4, %%ecx\n\t" "\n\t" "0:\n\t" - "movd (%2), %%mm0\n\t" - "punpckldq 16(%2), %%mm0\n\t" - "pfmul (%1), %%mm0\n\t" - "movd 32(%2), %%mm1\n\t" - "punpckldq 48(%2), %%mm1\n\t" - "pfmul 8(%1), %%mm1\n\t" - "pfacc %%mm1, %%mm0\n\t" - "pfacc %%mm0, %%mm0\n\t" - "movd %%mm0, (%0)\n\t" - "movd 4(%2), %%mm0\n\t" - "punpckldq 20(%2), %%mm0\n\t" - "pfmul (%1), %%mm0\n\t" - "movd 36(%2), %%mm1\n\t" - "punpckldq 52(%2), %%mm1\n\t" - "pfmul 8(%1), %%mm1\n\t" - "pfacc %%mm1, %%mm0\n\t" - "pfacc %%mm0, %%mm0\n\t" - "movd %%mm0, 4(%0)\n\t" - "movd 8(%2), %%mm0\n\t" - "punpckldq 24(%2), %%mm0\n\t" - "pfmul (%1), %%mm0\n\t" - "movd 40(%2), %%mm1\n\t" - "punpckldq 56(%2), %%mm1\n\t" - "pfmul 8(%1), %%mm1\n\t" - "pfacc %%mm1, %%mm0\n\t" - "pfacc %%mm0, %%mm0\n\t" - "movd %%mm0, 8(%0)\n\t" - "movd 12(%2), %%mm0\n\t" - "punpckldq 28(%2), %%mm0\n\t" - "pfmul (%1), %%mm0\n\t" - "movd 44(%2), %%mm1\n\t" - "punpckldq 60(%2), %%mm1\n\t" - "pfmul 8(%1), %%mm1\n\t" - "pfacc %%mm1, %%mm0\n\t" - "pfacc %%mm0, %%mm0\n\t" - "movd %%mm0, 12(%0)\n" - "\n\t" - "add $16, %0\n\t" - "add $16, %1\n\t" + "movd (%[right]), %%mm0\n\t" + "punpckldq 16(%[right]), %%mm0\n\t" + "pfmul (%[left]), %%mm0\n\t" + "movd 32(%[right]), %%mm1\n\t" + "punpckldq 48(%[right]), %%mm1\n\t" + "pfmul 8(%[left]), %%mm1\n\t" + "pfacc %%mm1, %%mm0\n\t" + "pfacc %%mm0, %%mm0\n\t" + "movd %%mm0, (%[result])\n\t" + "movd 4(%[right]), %%mm0\n\t" + "punpckldq 20(%[right]), %%mm0\n\t" + "pfmul (%[left]), %%mm0\n\t" 
+        "movd 36(%[right]), %%mm1\n\t"
+        "punpckldq 52(%[right]), %%mm1\n\t"
+        "pfmul 8(%[left]), %%mm1\n\t"
+        "pfacc %%mm1, %%mm0\n\t"
+        "pfacc %%mm0, %%mm0\n\t"
+        "movd %%mm0, 4(%[result])\n\t"
+        "movd 8(%[right]), %%mm0\n\t"
+        "punpckldq 24(%[right]), %%mm0\n\t"
+        "pfmul (%[left]), %%mm0\n\t"
+        "movd 40(%[right]), %%mm1\n\t"
+        "punpckldq 56(%[right]), %%mm1\n\t"
+        "pfmul 8(%[left]), %%mm1\n\t"
+        "pfacc %%mm1, %%mm0\n\t"
+        "pfacc %%mm0, %%mm0\n\t"
+        "movd %%mm0, 8(%[result])\n\t"
+        "movd 12(%[right]), %%mm0\n\t"
+        "punpckldq 28(%[right]), %%mm0\n\t"
+        "pfmul (%[left]), %%mm0\n\t"
+        "movd 44(%[right]), %%mm1\n\t"
+        "punpckldq 60(%[right]), %%mm1\n\t"
+        "pfmul 8(%[left]), %%mm1\n\t"
+        "pfacc %%mm1, %%mm0\n\t"
+        "pfacc %%mm0, %%mm0\n\t"
+        "movd %%mm0, 12(%[result])\n"
+        "\n\t"
+        "add $16, %[result]\n\t"
+        "add $16, %[left]\n\t"
         "decl %%ecx\n\t"
         "jnz 0b\n"
         "\n\t"
         "femms"
-        : "+r"(resultPtr), "+r"(left), "+r"(right)
-        :: "ecx", "mm0", "mm1", "memory"
+        : [result] "+r" (resultPtr),
+          [left] "+r" (left),
+          [right] "+r" (right)
+        :
+        : "ecx", "mm0", "mm1", "memory"
     );
     memcpy(self->_values, result, 16 * sizeof(float));
 }
@@ -174,57 +179,68 @@
 static void
 transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
     size_t count)
 {
+    /*
+     * Transforms `count` vectors in place using 3DNow! instructions. Each
+     * iteration reads 16 bytes from `vectors`, multiplies them with the 16
+     * bytes at offsets 0, 16, 32 and 48 of self->_values, sums each product
+     * with pfacc and stores the four sums back over the input.
+     *
+     * Label 0 is the loop head, label 1 the exit. GNU as resolves "0f" to
+     * the nearest following "0:", so the exit needs its own number;
+     * otherwise the count == 0 check jumps into the loop instead of past it.
+     */
     __asm__ __volatile__ (
-        "test %0, %0\n\t"
-        "jz 0f\n"
+        "test %[count], %[count]\n\t"
+        "jz 1f\n"
         "\n\t"
         "0:\n\t"
-        "movq (%1), %%mm0\n\t"
-        "movq 8(%1), %%mm1\n"
+        "movq (%[vectors]), %%mm0\n\t"
+        "movq 8(%[vectors]), %%mm1\n"
         "\n\t"
         "movq %%mm0, %%mm2\n\t"
         "movq %%mm1, %%mm3\n\t"
-        "pfmul (%2), %%mm2\n\t"
-        "pfmul 8(%2), %%mm3\n\t"
+        "pfmul (%[matrix]), %%mm2\n\t"
+        "pfmul 8(%[matrix]), %%mm3\n\t"
         "pfacc %%mm3, %%mm2\n\t"
         "pfacc %%mm2, %%mm2\n\t"
         "\n\t"
         "movq %%mm0, %%mm3\n\t"
         "movq %%mm1, %%mm4\n\t"
-        "pfmul 16(%2), %%mm3\n\t"
-        "pfmul 24(%2), %%mm4\n\t"
+        "pfmul 16(%[matrix]), %%mm3\n\t"
+        "pfmul 24(%[matrix]), %%mm4\n\t"
         "pfacc %%mm4, %%mm3\n\t"
         "pfacc %%mm3, %%mm3\n\t"
         "\n\t"
         "punpckldq %%mm3, %%mm2\n\t"
-        "movq %%mm2, (%1)\n"
+        "movq %%mm2, (%[vectors])\n"
         "\n\t"
         "movq %%mm0, %%mm2\n\t"
         "movq %%mm1, %%mm3\n\t"
-        "pfmul 32(%2), %%mm2\n\t"
-        "pfmul 40(%2), %%mm3\n\t"
+        "pfmul 32(%[matrix]), %%mm2\n\t"
+        "pfmul 40(%[matrix]), %%mm3\n\t"
         "pfacc %%mm3, %%mm2\n\t"
         "pfacc %%mm2, %%mm2\n\t"
         "\n\t"
-        "pfmul 48(%2), %%mm0\n\t"
-        "pfmul 56(%2), %%mm1\n\t"
+        "pfmul 48(%[matrix]), %%mm0\n\t"
+        "pfmul 56(%[matrix]), %%mm1\n\t"
         "pfacc %%mm1, %%mm0\n\t"
         "pfacc %%mm0, %%mm0\n\t"
         "\n\t"
         "punpckldq %%mm0, %%mm2\n\t"
-        "movq %%mm2, 8(%1)\n"
+        "movq %%mm2, 8(%[vectors])\n"
         "\n\t"
-        "add $16, %1\n\t"
-        "dec %0\n\t"
+        "add $16, %[vectors]\n\t"
+        "dec %[count]\n\t"
         "jnz 0b\n"
         "\n\t"
-        "0:\n\t"
+        "1:\n\t"
         "femms"
-        : "+r"(count), "+r"(vectors)
-        : "r"(self->_values)
+        : [count] "+r" (count),
+          [vectors] "+r" (vectors)
+        : [matrix] "r" (self->_values)
         : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
     );
 }
 # ifndef __clang__
 # pragma GCC pop_options