@@ -27,30 +27,30 @@ { 0, 0, 1, 0 }, { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 -#if (defined(OF_AMD64) || defined(OF_X86)) && defined(HAVE_INTEL_SYNTAX) +#if defined(OF_AMD64) || defined(OF_X86) static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { - float result[4][4] = {{ 0 }}; + float result[4][4]; for (uint_fast8_t i = 0; i < 4; i++) { for (uint_fast8_t j = 0; j < 4; j++) { __asm__ ( - "movd mm0, [%2]\n\t" - "punpckldq mm0, [%2 + 16]\n\t" - "pfmul mm0, [%1]\n\t" - "movd mm1, [%2 + 32]\n\t" - "punpckldq mm1, [%2 + 48]\n\t" - "pfmul mm1, [%1 + 8]\n\t" - "pfadd mm0, mm1\n\t" - "movq mm1, mm0\n\t" - "psrlq mm1, 32\n\t" - "pfadd mm0, mm1\n\t" - "movd %0, mm0" + "movd (%2), %%mm0\n\t" + "punpckldq 16(%2), %%mm0\n\t" + "pfmul (%1), %%mm0\n\t" + "movd 32(%2), %%mm1\n\t" + "punpckldq 48(%2), %%mm1\n\t" + "pfmul 8(%1), %%mm1\n\t" + "pfadd %%mm1, %%mm0\n\t" + "movq %%mm0, %%mm1\n\t" + "psrlq $32, %%mm1\n\t" + "pfadd %%mm1, %%mm0\n\t" + "movd %%mm0, %0" :: "m"(result[i][j]), "r"(&matrix->_values[i][0]), "r"(&self->_values[0][j]) : "mm0", "mm1", "memory" ); } @@ -65,52 +65,52 @@ transformedVector_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D vector) { OFVector4D result; __asm__ ( - "movq mm0, [%2]\n\t" - "movq mm1, [%2 + 8]\n" - "\n\t" - "movq mm2, mm0\n\t" - "movq mm3, mm1\n\t" - "pfmul mm2, [%1]\n\t" - "pfmul mm3, [%1 + 8]\n\t" - "pfadd mm2, mm3\n\t" - "movq mm3, mm2\n\t" - "psrlq mm3, 32\n\t" - "pfadd mm2, mm3\n" - "\n\t" - "movq mm3, mm0\n\t" - "movq mm4, mm1\n\t" - "pfmul mm3, [%1 + 16]\n\t" - "pfmul mm4, [%1 + 24]\n\t" - "pfadd mm3, mm4\n\t" - "movq mm4, mm3\n\t" - "psrlq mm4, 32\n\t" - "pfadd mm3, mm4\n" - "\n\t" - "punpckldq mm2, mm3\n\t" - "movq [%0], mm2\n" - "\n\t" - "movq mm2, mm0\n\t" - "movq mm3, mm1\n\t" - "pfmul mm2, [%1 + 32]\n\t" - "pfmul mm3, [%1 + 40]\n\t" - "pfadd mm2, mm3\n\t" - "movq mm3, mm2\n\t" - "psrlq mm3, 32\n\t" - "pfadd mm2, mm3\n" - "\n\t" - "pfmul mm0, [%1 + 48]\n\t" - "pfmul mm1, [%1 + 56]\n\t" - "pfadd mm0, mm1\n\t" - "movq mm1, mm0\n\t" - "psrlq mm1, 32\n\t" - "pfadd mm0, mm1\n" - "\n\t" - "punpckldq mm2, mm0\n\t" - "movq [%0 + 8], mm2\n" + "movq (%2), %%mm0\n\t" + "movq 8(%2), %%mm1\n" + "\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "pfmul (%1), %%mm2\n\t" + "pfmul 8(%1), %%mm3\n\t" + "pfadd %%mm3, %%mm2\n\t" + "movq %%mm2, %%mm3\n\t" + "psrlq $32, %%mm3\n\t" + "pfadd %%mm3, %%mm2\n" + "\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm1, %%mm4\n\t" + "pfmul 16(%1), %%mm3\n\t" + "pfmul 24(%1), %%mm4\n\t" + "pfadd %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $32, %%mm4\n\t" + "pfadd %%mm4, %%mm3\n" + "\n\t" + "punpckldq %%mm3, %%mm2\n\t" + "movq %%mm2, (%0)\n" + "\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "pfmul 32(%1), %%mm2\n\t" + "pfmul 40(%1), %%mm3\n\t" + "pfadd %%mm3, %%mm2\n\t" + "movq %%mm2, %%mm3\n\t" + "psrlq $32, %%mm3\n\t" + "pfadd %%mm3, %%mm2\n" + "\n\t" + "pfmul 48(%1), %%mm0\n\t" + "pfmul 56(%1), %%mm1\n\t" + "pfadd %%mm1, %%mm0\n\t" + "movq %%mm0, %%mm1\n\t" + "psrlq $32, %%mm1\n\t" + "pfadd %%mm1, %%mm0\n" + "\n\t" + "punpckldq %%mm0, %%mm2\n\t" + "movq %%mm2, 8(%0)\n" "\n\t" "femms" :: "r"(&result), "r"(&self->_values), "r"(&vector) : "mm0", "mm1", "mm2", "mm3", "mm4", "memory" );
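/*
 * For reference only: a minimal plain-C sketch of what the two 3DNow!
 * routines in this hunk compute, assuming the row-major
 * `float _values[4][4]` layout that the assembly indexes (16 bytes per
 * row, 4 bytes per element), that OFVector4D has float x/y/z/w members,
 * and that <string.h>/<stdint.h> are already pulled in by the surrounding
 * file. The `_C`-suffixed names are illustrative and not part of this
 * change.
 */
static void
multiplyWithMatrix_C(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float result[4][4];

	/*
	 * result[i][j] is the dot product of matrix's row i with self's
	 * column j, which is what each iteration of the inline assembly
	 * above produces for one element.
	 */
	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			result[i][j] = 0.0f;

			for (uint_fast8_t k = 0; k < 4; k++)
				result[i][j] += matrix->_values[i][k] *
				    self->_values[k][j];
		}
	}

	/* Write the product back into the receiver. */
	memcpy(self->_values, result, sizeof(result));
}

static OFVector4D
transformedVector_C(OFMatrix4x4 *self, SEL _cmd, OFVector4D vector)
{
	OFVector4D result;

	/*
	 * Each component of the result is the dot product of one matrix row
	 * with the input vector, matching the four blocks of assembly above.
	 */
	result.x = self->_values[0][0] * vector.x +
	    self->_values[0][1] * vector.y +
	    self->_values[0][2] * vector.z +
	    self->_values[0][3] * vector.w;
	result.y = self->_values[1][0] * vector.x +
	    self->_values[1][1] * vector.y +
	    self->_values[1][2] * vector.z +
	    self->_values[1][3] * vector.w;
	result.z = self->_values[2][0] * vector.x +
	    self->_values[2][1] * vector.y +
	    self->_values[2][2] * vector.z +
	    self->_values[2][3] * vector.w;
	result.w = self->_values[3][0] * vector.x +
	    self->_values[3][1] * vector.y +
	    self->_values[3][2] * vector.z +
	    self->_values[3][3] * vector.w;

	return result;
}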