Index: src/OFMatrix4x4.h
==================================================================
--- src/OFMatrix4x4.h
+++ src/OFMatrix4x4.h
@@ -21,11 +21,11 @@
  * @brief A 4x4 matrix of floats.
  */
 OF_SUBCLASSING_RESTRICTED
 @interface OFMatrix4x4: OFObject
 {
-	float _values[4][4];
+	float _values[4][4] OF_ALIGN(16);
 }
 
 #ifdef OF_HAVE_CLASS_PROPERTIES
 @property (readonly, class) OFMatrix4x4 *identityMatrix;
 #endif

Index: src/OFMatrix4x4.m
==================================================================
--- src/OFMatrix4x4.m
+++ src/OFMatrix4x4.m
@@ -28,10 +28,62 @@
 	{ 0, 0, 0, 1 }
 };
 
 @implementation OFMatrix4x4
 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
+# ifndef __clang__
+#  pragma GCC push_options
+#  pragma GCC target("sse4.1")
+# endif
+/*
+ * Transforms count vectors in place using SSE4.1. Each vector is replaced by
+ * its dot products with the four rows of the matrix. All loads and stores
+ * use movaps, so the matrix values and the vectors must be 16 byte aligned.
+ */
+static void
+transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
+    size_t count)
+{
+	__asm__ __volatile__ (
+	    /* Nothing to do for count == 0; must skip the loop entirely. */
+	    "test %0, %0\n\t"
+	    "jz 1f\n"
+	    "\n\t"
+	    "movaps (%2), %%xmm0\n\t"
+	    "movaps 16(%2), %%xmm1\n\t"
+	    "movaps 32(%2), %%xmm2\n\t"
+	    "movaps 48(%2), %%xmm3\n"
+	    "\n\t"
+	    "0:\n\t"
+	    "movaps (%1), %%xmm4\n\t"
+	    "movaps %%xmm4, %%xmm5\n\t"
+	    "dpps $0xFF, %%xmm0, %%xmm4\n\t"
+	    "movaps %%xmm5, %%xmm6\n\t"
+	    "dpps $0xFF, %%xmm1, %%xmm5\n\t"
+	    "movaps %%xmm6, %%xmm7\n\t"
+	    "dpps $0xFF, %%xmm2, %%xmm6\n\t"
+	    "dpps $0xFF, %%xmm3, %%xmm7\n\t"
+	    "insertps $0x10, %%xmm5, %%xmm4\n\t"
+	    "insertps $0x20, %%xmm6, %%xmm4\n\t"
+	    "insertps $0x30, %%xmm7, %%xmm4\n\t"
+	    "movaps %%xmm4, (%1)\n"
+	    "\n\t"
+	    "add $16, %1\n\t"
+	    "dec %0\n\t"
+	    "jnz 0b\n"
+	    "\n\t"
+	    "1:\n"
+	    : "+r"(count), "+r"(vectors)
+	    : "r"(&self->_values)
+	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+	      "memory"
+	);
+}
+# ifndef __clang__
+#  pragma GCC pop_options
+# endif
+
 # ifndef __clang__
 #  pragma GCC push_options
 #  pragma GCC target("3dnow,3dnowa")
 # endif
 static void
@@ -264,11 +316,14 @@
 # define REPLACE(selector, func)				\
 	typeEncoding = method_getTypeEncoding(			\
 	    class_getInstanceMethod(self, selector));		\
 	class_replaceMethod(self, selector, (IMP)func, typeEncoding);
 
-	if ([OFSystemInfo supports3DNow]) {
+	if ([OFSystemInfo supportsSSE41]) {
+		REPLACE(@selector(transformVectors:count:),
+		    transformVectors_SSE41)
+	} else if ([OFSystemInfo supports3DNow]) {
 		if ([OFSystemInfo supportsEnhanced3DNow]) {
 			REPLACE(@selector(multiplyWithMatrix:),
 			    multiplyWithMatrix_enhanced3DNow)
 			REPLACE(@selector(transformVectors:count:),
 			    transformVectors_enhanced3DNow)

Index: src/OFObject.h
==================================================================
--- src/OFObject.h
+++ src/OFObject.h
@@ -347,11 +347,11 @@
 /**
  * @struct OFVector4D OFObject.h ObjFW/OFObject.h
  *
  * @brief A vector in 4D space.
  */
-typedef struct OF_BOXABLE OFVector4D {
+typedef struct OF_BOXABLE OF_ALIGN(16) OFVector4D {
 	/** The x coordinate of the vector */
 	float x;
 	/** The y coordinate of the vector */
 	float y;
 	/** The z coordinate of the vector */

Index: src/macros.h
==================================================================
--- src/macros.h
+++ src/macros.h
@@ -99,17 +99,21 @@
 # define OF_UNLIKELY(cond) (cond)
 # define OF_CONST_FUNC
 # define OF_NO_RETURN_FUNC
 # define OF_WEAK_REF(sym)
 #endif
+
+#ifdef __GNUC__
+# define OF_ALIGN(alignment) __attribute__((__aligned__(alignment)))
+#endif
 
 #if __STDC_VERSION__ >= 201112L
 # define OF_ALIGNOF(type) _Alignof(type)
 # define OF_ALIGNAS(type) _Alignas(type)
 #else
 # define OF_ALIGNOF(type) __alignof__(type)
-# define OF_ALIGNAS(type) __attribute__((__aligned__(__alignof__(type))))
+# define OF_ALIGNAS(type) OF_ALIGN(OF_ALIGNOF(type))
 #endif
 
 #if __STDC_VERSION__ >= 201112L && defined(OF_HAVE_MAX_ALIGN_T)
 # define OF_BIGGEST_ALIGNMENT _Alignof(max_align_t)
 #else