@@ -83,11 +83,11 @@ { float (*left)[4] = matrix->_values, (*right)[4] = self->_values; float result[4][4], (*resultPtr)[4] = result; __asm__ __volatile__ ( - "xorw %%cx, %%cx\n" + "movl $4, %%ecx\n\t" "\n\t" "0:\n\t" "movd (%2), %%mm0\n\t" "punpckldq 16(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" @@ -94,28 +94,47 @@ "movd 32(%2), %%mm1\n\t" "punpckldq 48(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" - "movd %%mm0, (%0)\n" - "\n\t" - "add $4, %0\n\t" - "add $4, %2\n\t" - "incb %%cl\n\t" - "cmpb $4, %%cl\n\t" - "jb 0b\n" - "\n\t" + "movd %%mm0, (%0)\n\t" + "movd 4(%2), %%mm0\n\t" + "punpckldq 20(%2), %%mm0\n\t" + "pfmul (%1), %%mm0\n\t" + "movd 36(%2), %%mm1\n\t" + "punpckldq 52(%2), %%mm1\n\t" + "pfmul 8(%1), %%mm1\n\t" + "pfacc %%mm1, %%mm0\n\t" + "pfacc %%mm0, %%mm0\n\t" + "movd %%mm0, 4(%0)\n\t" + "movd 8(%2), %%mm0\n\t" + "punpckldq 24(%2), %%mm0\n\t" + "pfmul (%1), %%mm0\n\t" + "movd 40(%2), %%mm1\n\t" + "punpckldq 56(%2), %%mm1\n\t" + "pfmul 8(%1), %%mm1\n\t" + "pfacc %%mm1, %%mm0\n\t" + "pfacc %%mm0, %%mm0\n\t" + "movd %%mm0, 8(%0)\n\t" + "movd 12(%2), %%mm0\n\t" + "punpckldq 28(%2), %%mm0\n\t" + "pfmul (%1), %%mm0\n\t" + "movd 44(%2), %%mm1\n\t" + "punpckldq 60(%2), %%mm1\n\t" + "pfmul 8(%1), %%mm1\n\t" + "pfacc %%mm1, %%mm0\n\t" + "pfacc %%mm0, %%mm0\n\t" + "movd %%mm0, 12(%0)\n" + "\n\t" + "add $16, %0\n\t" "add $16, %1\n\t" - "sub $16, %2\n\t" - "xorb %%cl, %%cl\n\t" - "incb %%ch\n\t" - "cmpb $4, %%ch\n\t" - "jb 0b\n" + "decl %%ecx\n\t" + "jnz 0b\n" "\n\t" "femms" : "+r"(resultPtr), "+r"(left), "+r"(right) - :: "cx", "mm0", "mm1", "memory" + :: "ecx", "mm0", "mm1", "memory" ); memcpy(self->_values, result, 16 * sizeof(float)); }