Overview
Comment: | OFMatrix4x4: Restore SSE4.1 code
The Clang alignment bug has been worked around in the previous commit, so the SSE4.1 code is restored here. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: |
0eb97e46123c1283123d0774270f36a9 |
User & Date: | js on 2023-11-04 13:10:43 |
Other Links: | manifest | tags |
Context
2023-11-04
| ||
13:13 | Make GCC happy again check-in: 998478014d user: js tags: trunk | |
13:10 | OFMatrix4x4: Restore SSE4.1 code check-in: 0eb97e4612 user: js tags: trunk | |
13:06 | Work around Clang not aligning ivars correctly check-in: 055e14fc75 user: js tags: trunk | |
2023-11-03
| ||
01:07 | OFMatrix4x4: Remove SSE4.1 due to Clang bugs check-in: 7e1dbda4b4 user: js tags: trunk | |
Changes
Modified src/OFMatrix4x4.m from [1cf8596c69] to [52a22d3f60].
︙ | ︙ | |||
26 27 28 29 30 31 32 33 34 35 36 37 38 39 | { 0, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options # pragma GCC target("3dnow,3dnowa") # endif static void multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | { 0, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options # pragma GCC target("sse4.1") # endif static void transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { __asm__ __volatile__ ( "test %0, %0\n\t" "jz 0f\n" "\n\t" "movaps (%2), %%xmm0\n\t" "movaps 16(%2), %%xmm1\n\t" "movaps 32(%2), %%xmm2\n\t" "movaps 48(%2), %%xmm3\n" "\n\t" "0:\n\t" "movaps (%1), %%xmm4\n\t" "movaps %%xmm4, %%xmm5\n\t" "dpps $0xFF, %%xmm0, %%xmm4\n\t" "movaps %%xmm5, %%xmm6\n\t" "dpps $0xFF, %%xmm1, %%xmm5\n\t" "movaps %%xmm6, %%xmm7\n\t" "dpps $0xFF, %%xmm2, %%xmm6\n\t" "dpps $0xFF, %%xmm3, %%xmm7\n\t" "insertps $0x10, %%xmm5, %%xmm4\n\t" "insertps $0x20, %%xmm6, %%xmm4\n\t" "insertps $0x30, %%xmm7, %%xmm4\n\t" "movaps %%xmm4, (%1)\n" "\n\t" "add $16, %1\n\t" "dec %0\n\t" "jnz 0b\n" : "+r"(count), "+r"(vectors) : "r"(self->_values) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif # ifndef __clang__ # pragma GCC push_options # pragma GCC target("3dnow,3dnowa") # endif static void multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) |
︙ | ︙ | |||
262 263 264 265 266 267 268 | return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); | | > > > | 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); if ([OFSystemInfo supportsSSE41]) { REPLACE(@selector(transformVectors:count:), transformVectors_SSE41) } else if ([OFSystemInfo supports3DNow]) { if ([OFSystemInfo supportsEnhanced3DNow]) { REPLACE(@selector(multiplyWithMatrix:), multiplyWithMatrix_enhanced3DNow) REPLACE(@selector(transformVectors:count:), transformVectors_enhanced3DNow) } else { REPLACE(@selector(multiplyWithMatrix:), |
︙ | ︙ |