Differences From Artifact [740fcdd16f]:
- File src/OFMatrix4x4.m — part of check-in [b5c3a36731] at 2023-11-02 23:00:22 on branch trunk — OFMatrix4x4: Minor cleanups (user: js, size: 10267) [annotate] [blame] [check-ins using] [more...]
To Artifact [46f1ac4383]:
- File
src/OFMatrix4x4.m
— part of check-in
[7f304f573b]
at
2023-11-03 00:16:18
on branch trunk
— OFMatrix4x4: SSE4.1 for -[transformVectors:count:]
This requires the vectors to be 16-byte aligned. In order to achieve
this, the OFVector4D type is changed to have an alignment of 16 bytes.
However, this does *not* break the ABI, because the only method actually
requiring 16-byte alignment is -[transformVectors:count:], which was not
in ObjFW 1.0. Hence, binaries compiled for ObjFW 1.0 have no 16-byte
alignment for OFVector4D, but also cannot ever call into any code that
needs it. (-[transformedVector:] calls into -[transformVectors:count:],
but passes it a properly aligned copy that it creates.) (user: js, size: 11520) [annotate] [blame] [check-ins using]
︙ | ︙ | |||
26 27 28 29 30 31 32 33 34 35 36 37 38 39 | { 0, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options # pragma GCC target("3dnow,3dnowa") # endif static void multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | { 0, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options # pragma GCC target("sse4.1") # endif static void transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { __asm__ __volatile__ ( "test %0, %0\n\t" "jz 0f\n" "\n\t" "movaps (%2), %%xmm0\n\t" "movaps 16(%2), %%xmm1\n\t" "movaps 32(%2), %%xmm2\n\t" "movaps 48(%2), %%xmm3\n" "\n\t" "0:\n\t" "movaps (%1), %%xmm4\n\t" "movaps %%xmm4, %%xmm5\n\t" "dpps $0xFF, %%xmm0, %%xmm4\n\t" "movaps %%xmm5, %%xmm6\n\t" "dpps $0xFF, %%xmm1, %%xmm5\n\t" "movaps %%xmm6, %%xmm7\n\t" "dpps $0xFF, %%xmm2, %%xmm6\n\t" "dpps $0xFF, %%xmm3, %%xmm7\n\t" "insertps $0x10, %%xmm5, %%xmm4\n\t" "insertps $0x20, %%xmm6, %%xmm4\n\t" "insertps $0x30, %%xmm7, %%xmm4\n\t" "movaps %%xmm4, (%1)\n" "\n\t" "add $16, %1\n\t" "dec %0\n\t" "jnz 0b\n" : "+r"(count), "+r"(vectors) : "r"(&self->_values) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif # ifndef __clang__ # pragma GCC push_options # pragma GCC target("3dnow,3dnowa") # endif static void multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) |
︙ | ︙ | |||
262 263 264 265 266 267 268 | return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); | | > > > | 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); if ([OFSystemInfo supportsSSE41]) { REPLACE(@selector(transformVectors:count:), transformVectors_SSE41) } else if ([OFSystemInfo supports3DNow]) { if ([OFSystemInfo supportsEnhanced3DNow]) { REPLACE(@selector(multiplyWithMatrix:), multiplyWithMatrix_enhanced3DNow) REPLACE(@selector(transformVectors:count:), transformVectors_enhanced3DNow) } else { REPLACE(@selector(multiplyWithMatrix:), |
︙ | ︙ |