@@ -30,11 +30,11 @@
 
 @implementation OFMatrix4x4
 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
 # ifndef __clang__
 # pragma GCC push_options
-# pragma GCC target("3dnow")
+# pragma GCC target("3dnow,3dnowa")
 # endif
 static void
 multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
     OFMatrix4x4 *matrix)
 {
@@ -74,10 +74,90 @@
 	    :: "cx", "mm0", "mm1", "memory"
 	);
 
 	memcpy(self->_values, result, sizeof(result));
 }
+
+/*
+ * Enhanced 3DNow! implementation of -[transformVectors:count:].
+ *
+ * Multiplies each of the `count` vectors at `vectors` with the matrix in
+ * self->_values and writes the result back in place. Every result component
+ * is the dot product of one 16 byte matrix row with the vector: two pfmul
+ * and a pfadd leave two partial sums, pswapd + pfadd fold them into one.
+ *
+ * The loop head is local label 1 and the exit is local label 0. They must
+ * not share a number: "jz 0f" binds to the next definition of 0, so with
+ * the loop head also named 0 a count of 0 would fall into the loop.
+ */
+static void
+transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
+    size_t count)
+{
+	__asm__ __volatile__ (
+	    "test %0, %0\n\t"
+	    "jz 0f\n"
+	    "\n\t"
+	    "1:\n\t"
+	    "movq (%1), %%mm0\n\t"
+	    "movq 8(%1), %%mm1\n"
+	    "\n\t"
+	    "movq %%mm0, %%mm2\n\t"
+	    "movq %%mm1, %%mm3\n\t"
+	    "pfmul (%2), %%mm2\n\t"
+	    "pfmul 8(%2), %%mm3\n\t"
+	    "pfadd %%mm3, %%mm2\n\t"
+	    "pswapd %%mm2, %%mm3\n\t"
+	    "pfadd %%mm3, %%mm2\n"
+	    "\n\t"
+	    "movq %%mm0, %%mm3\n\t"
+	    "movq %%mm1, %%mm4\n\t"
+	    "pfmul 16(%2), %%mm3\n\t"
+	    "pfmul 24(%2), %%mm4\n\t"
+	    "pfadd %%mm4, %%mm3\n\t"
+	    "pswapd %%mm3, %%mm4\n\t"
+	    "pfadd %%mm4, %%mm3\n"
+	    "\n\t"
+	    "punpckldq %%mm3, %%mm2\n\t"
+	    "movq %%mm2, (%1)\n"
+	    "\n\t"
+	    "movq %%mm0, %%mm2\n\t"
+	    "movq %%mm1, %%mm3\n\t"
+	    "pfmul 32(%2), %%mm2\n\t"
+	    "pfmul 40(%2), %%mm3\n\t"
+	    "pfadd %%mm3, %%mm2\n\t"
+	    "pswapd %%mm2, %%mm3\n\t"
+	    "pfadd %%mm3, %%mm2\n"
+	    "\n\t"
+	    "pfmul 48(%2), %%mm0\n\t"
+	    "pfmul 56(%2), %%mm1\n\t"
+	    "pfadd %%mm1, %%mm0\n\t"
+	    "pswapd %%mm0, %%mm1\n\t"
+	    "pfadd %%mm1, %%mm0\n"
+	    "\n\t"
+	    "punpckldq %%mm0, %%mm2\n\t"
+	    "movq %%mm2, 8(%1)\n"
+	    "\n\t"
+	    "add $16, %1\n\t"
+	    "dec %0\n\t"
+	    "jnz 1b\n"
+	    "\n\t"
+	    "0:\n\t"
+	    "femms"
+	    : "+r"(count), "+r"(vectors)
+	    : "r"(&self->_values)
+	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
+	);
+}
+# ifndef __clang__
+# pragma GCC pop_options
+# endif
+
+# ifndef __clang__
+# pragma GCC push_options
+# pragma GCC target("3dnow")
+# endif
 static void
 multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
 {
 	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
 	float result[4][4], *resultPtr = &result[0][0];
@@ -117,79 +197,19 @@
 	);
 
 	memcpy(self->_values, result, sizeof(result));
 }
 
-static void
-transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
-    size_t count)
-{
-	__asm__ __volatile__ (
-	    "0:\n\t"
-	    "test %0, %0\n\t"
-	    "jz 0f\n"
-	    "\n\t"
-	    "movq (%1), %%mm0\n\t"
-	    "movq 8(%1), %%mm1\n"
-	    "\n\t"
-	    "movq %%mm0, %%mm2\n\t"
-	    "movq %%mm1, %%mm3\n\t"
-	    "pfmul (%2), %%mm2\n\t"
-	    "pfmul 8(%2), %%mm3\n\t"
-	    "pfadd %%mm3, %%mm2\n\t"
-	    "pswapd %%mm2, %%mm3\n\t"
-	    "pfadd %%mm3, %%mm2\n"
-	    "\n\t"
-	    "movq %%mm0, %%mm3\n\t"
-	    "movq %%mm1, %%mm4\n\t"
-	    "pfmul 16(%2), %%mm3\n\t"
-	    "pfmul 24(%2), %%mm4\n\t"
-	    "pfadd %%mm4, %%mm3\n\t"
-	    "pswapd %%mm3, %%mm4\n\t"
-	    "pfadd %%mm4, %%mm3\n"
-	    "\n\t"
-	    "punpckldq %%mm3, %%mm2\n\t"
-	    "movq %%mm2, (%1)\n"
-	    "\n\t"
-	    "movq %%mm0, %%mm2\n\t"
-	    "movq %%mm1, %%mm3\n\t"
-	    "pfmul 32(%2), %%mm2\n\t"
-	    "pfmul 40(%2), %%mm3\n\t"
-	    "pfadd %%mm3, %%mm2\n\t"
-	    "pswapd %%mm2, %%mm3\n\t"
-	    "pfadd %%mm3, %%mm2\n"
-	    "\n\t"
-	    "pfmul 48(%2), %%mm0\n\t"
-	    "pfmul 56(%2), %%mm1\n\t"
-	    "pfadd %%mm1, %%mm0\n\t"
-	    "pswapd %%mm0, %%mm1\n\t"
-	    "pfadd %%mm1, %%mm0\n"
-	    "\n\t"
-	    "punpckldq %%mm0, %%mm2\n\t"
-	    "movq %%mm2, 8(%1)\n"
-	    "\n\t"
-	    "add $16, %1\n\t"
-	    "dec %0\n\t"
-	    "jmp 0b\n"
-	    "\n\t"
-	    "0:\n\t"
-	    "femms"
-	    : "+r"(count), "+r"(vectors)
-	    : "r"(&self->_values)
-	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
-	);
-}
-
 static void
 transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
     size_t count)
 {
 	__asm__ __volatile__ (
-	    "0:\n\t"
 	    "test %0, %0\n\t"
 	    "jz 0f\n"
 	    "\n\t"
+	    "1:\n\t"	/* loop head; 0 is the exit label */
 	    "movq (%1), %%mm0\n\t"
 	    "movq 8(%1), %%mm1\n"
 	    "\n\t"
 	    "movq %%mm0, %%mm2\n\t"
 	    "movq %%mm1, %%mm3\n\t"
@@ -231,11 +251,12 @@
 	    "punpckldq %%mm0, %%mm2\n\t"
 	    "movq %%mm2, 8(%1)\n"
 	    "\n\t"
 	    "add $16, %1\n\t"
 	    "dec %0\n\t"
-	    "jmp 0b\n"
+	    "jnz 1b\n"	/* back to the loop head */
+	    "\n\t"
 	    "0:\n\t"
 	    "femms"
 	    : "+r"(count), "+r"(vectors)
 	    : "r"(&self->_values)
 	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
@@ -255,20 +276,22 @@
 # define REPLACE(selector, func)				\
 	typeEncoding = method_getTypeEncoding(			\
 	    class_getInstanceMethod(self, selector));		\
 	class_replaceMethod(self, selector, (IMP)func, typeEncoding);
 
-	if ([OFSystemInfo supportsEnhanced3DNow]) {
-		REPLACE(@selector(multiplyWithMatrix:),
-		    multiplyWithMatrix_enhanced3DNow)
-		REPLACE(@selector(transformVectors:count:),
-		    transformVectors_enhanced3DNow)
-	} else if ([OFSystemInfo supports3DNow]) {
-		REPLACE(@selector(multiplyWithMatrix:),
-		    multiplyWithMatrix_3DNow)
-		REPLACE(@selector(transformVectors:count:),
-		    transformVectors_3DNow)
+	if ([OFSystemInfo supports3DNow]) {
+		if ([OFSystemInfo supportsEnhanced3DNow]) {
+			REPLACE(@selector(multiplyWithMatrix:),
+			    multiplyWithMatrix_enhanced3DNow)
+			REPLACE(@selector(transformVectors:count:),
+			    transformVectors_enhanced3DNow)
+		} else {
+			REPLACE(@selector(multiplyWithMatrix:),
+			    multiplyWithMatrix_3DNow)
+			REPLACE(@selector(transformVectors:count:),
+			    transformVectors_3DNow)
+		}
 	}
 
 # undef REPLACE
 }
 #endif