Index: configure.ac
==================================================================
--- configure.ac
+++ configure.ac
@@ -331,10 +331,20 @@
 AX_CHECK_COMPILER_FLAGS(-Xclang -fno-constant-cfstrings, [
 	flag="-Xclang -fno-constant-cfstrings"
 	OBJCFLAGS="$OBJCFLAGS $flag"
 	OBJFW_OBJCFLAGS="$OBJFW_OBJCFLAGS $flag"
 ])
+
+case "$host_cpu" in
+i*86|x86_64)
+	AX_CHECK_COMPILER_FLAGS([-masm=intel], [
+		OBJCFLAGS="$OBJCFLAGS -masm=intel"
+		AC_DEFINE(HAVE_INTEL_SYNTAX, 1, [Whether asm syntax is Intel])
+	])
+	;;
+esac
+
 AX_CHECK_COMPILER_FLAGS([-Wsign-compare -Werror],
 	[OBJCFLAGS="$OBJCFLAGS -Wsign-compare"])
 AS_IF([test x"$with_nds" != x"yes"], [
 	AX_CHECK_COMPILER_FLAGS([-Wshadow -Werror],
 		[OBJCFLAGS="$OBJCFLAGS -Wshadow"])

Index: src/OFMatrix4x4.m
==================================================================
--- src/OFMatrix4x4.m
+++ src/OFMatrix4x4.m
@@ -14,21 +14,88 @@
  */
 
 #include "config.h"
 
 #import "OFMatrix4x4.h"
-#import "OFOnce.h"
 #import "OFString.h"
+#import "OFSystemInfo.h"
+
+#import "OFOnce.h"
 
 static const float identityValues[4][4] = {
 	{ 1, 0, 0, 0 },
 	{ 0, 1, 0, 0 },
 	{ 0, 0, 1, 0 },
 	{ 0, 0, 0, 1 }
 };
 
 @implementation OFMatrix4x4
+#if (defined(OF_AMD64) || defined(OF_X86)) && defined(HAVE_INTEL_SYNTAX)
+/*
+ * 3DNow! implementation of -[multiplyWithMatrix:] that +[initialize] installs
+ * via class_replaceMethod(), hence the IMP-style (self, _cmd, ...) signature.
+ *
+ * result[i][j] is the dot product of row i of matrix and column j of self.
+ * The rows of _values are assumed to be 4 floats (16 bytes) apart.
+ */
+static void
+multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
+{
+	float result[4][4] = {{ 0 }};
+
+	for (uint_fast8_t i = 0; i < 4; i++) {
+		for (uint_fast8_t j = 0; j < 4; j++) {
+			/*
+			 * mm0 = column j of self (rows 0, 1) * row i
+			 * of matrix (columns 0, 1); mm1 likewise for
+			 * rows / columns 2, 3. Both are summed up.
+			 */
+			__asm__ __volatile__ (
+			    "movd mm0, [%2]\n\t"
+			    "punpckldq mm0, [%2 + 16]\n\t"
+			    "pfmul mm0, [%1]\n\t"
+			    "movd mm1, [%2 + 32]\n\t"
+			    "punpckldq mm1, [%2 + 48]\n\t"
+			    "pfmul mm1, [%1 + 8]\n\t"
+			    "pfadd mm0, mm1\n\t"
+			    "movq mm1, mm0\n\t"
+			    "psrlq mm1, 32\n\t"
+			    "pfadd mm0, mm1\n\t"
+			    "movd %0, mm0"
+			    : "=m"(result[i][j])
+			    : "r"(&matrix->_values[i][0]),
+			      "r"(&self->_values[0][j])
+			    : "mm0", "mm1", "memory"
+			);
+		}
+	}
+
+	/* Leave the MMX state again now that all 3DNow! work is done. */
+	__asm__ __volatile__ ("femms");
+
+	memcpy(self->_values, result, sizeof(result));
+}
+
+/*
+ * Swaps the implementation of -[multiplyWithMatrix:] for the 3DNow! one if
+ * +[OFSystemInfo supports3DNow] says so. Subclasses are skipped.
+ */
++ (void)initialize
+{
+	if (self != [OFMatrix4x4 class])
+		return;
+
+	if ([OFSystemInfo supports3DNow]) {
+		const SEL selector = @selector(multiplyWithMatrix:);
+		const char *typeEncoding = method_getTypeEncoding(
+		    class_getInstanceMethod(self, selector));
+		class_replaceMethod(self, selector,
+		    (IMP)multiplyWithMatrix_3DNow, typeEncoding);
+	}
+}
+#endif
+
 + (OFMatrix4x4 *)identityMatrix
 {
 	return [[[OFMatrix4x4 alloc]
 	    initWithValues: identityValues] autorelease];
 }

Index: src/OFSystemInfo.m
==================================================================
--- src/OFSystemInfo.m
+++ src/OFSystemInfo.m
@@ -296,13 +296,13 @@
 	 * as ebx is a special register in PIC code. Yes, GCC is indeed not
 	 * able to just push a register onto the stack before the __asm__ block
 	 * and to pop it afterwards.
 	 */
 	__asm__ (
-	    "xchgl %%ebx, %%edi\n\t"
+	    "xchg{l} { %%ebx, %%edi | edi, ebx }\n\t"
 	    "cpuid\n\t"
-	    "xchgl %%edi, %%ebx"
+	    "xchg{l} { %%edi, %%ebx | ebx, edi }"
 	    : "=a"(regs.eax), "=D"(regs.ebx), "=c"(regs.ecx), "=d"(regs.edx)
 	    : "a"(eax), "c"(ecx)
 	);
 # else
 	memset(&regs, 0, sizeof(regs));