Index: src/OFMatrix4x4.m ================================================================== --- src/OFMatrix4x4.m +++ src/OFMatrix4x4.m @@ -28,10 +28,54 @@ { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) +# ifndef __clang__ +# pragma GCC push_options +# pragma GCC target("sse4.1") +# endif +static void +transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, + size_t count) +{ + __asm__ __volatile__ ( + "test %0, %0\n\t" + "jz 0f\n" + "\n\t" + "movaps (%2), %%xmm0\n\t" + "movaps 16(%2), %%xmm1\n\t" + "movaps 32(%2), %%xmm2\n\t" + "movaps 48(%2), %%xmm3\n" + "\n\t" + "0:\n\t" + "movaps (%1), %%xmm4\n\t" + "movaps %%xmm4, %%xmm5\n\t" + "dpps $0xFF, %%xmm0, %%xmm4\n\t" + "movaps %%xmm5, %%xmm6\n\t" + "dpps $0xFF, %%xmm1, %%xmm5\n\t" + "movaps %%xmm6, %%xmm7\n\t" + "dpps $0xFF, %%xmm2, %%xmm6\n\t" + "dpps $0xFF, %%xmm3, %%xmm7\n\t" + "insertps $0x10, %%xmm5, %%xmm4\n\t" + "insertps $0x20, %%xmm6, %%xmm4\n\t" + "insertps $0x30, %%xmm7, %%xmm4\n\t" + "movaps %%xmm4, (%1)\n" + "\n\t" + "add $16, %1\n\t" + "dec %0\n\t" + "jnz 0b\n" + : "+r"(count), "+r"(vectors) + : "r"(self->_values) + : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "memory" + ); +} +# ifndef __clang__ +# pragma GCC pop_options +# endif + # ifndef __clang__ # pragma GCC push_options # pragma GCC target("3dnow,3dnowa") # endif static void @@ -264,11 +308,14 @@ # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); - if ([OFSystemInfo supports3DNow]) { + if ([OFSystemInfo supportsSSE41]) { + REPLACE(@selector(transformVectors:count:), + transformVectors_SSE41) + } else if ([OFSystemInfo supports3DNow]) { if ([OFSystemInfo supportsEnhanced3DNow]) { REPLACE(@selector(multiplyWithMatrix:), multiplyWithMatrix_enhanced3DNow) REPLACE(@selector(transformVectors:count:), transformVectors_enhanced3DNow)