Differences From Artifact [b31d5e9464]:
- File
src/OFMatrix4x4.m
— part of check-in
[3280466d35]
at
2023-11-05 18:18:05
on branch trunk
— OFMatrix4x4: Unroll inner loop in 3DNow! version
This results in a ~ 16% performance improvement on a Duron 750. (user: js, size: 9256) [annotate] [blame] [check-ins using]
To Artifact [ac56c8b767]:
- File
src/OFMatrix4x4.m
— part of check-in
[cf955413ab]
at
2023-11-06 00:59:37
on branch trunk
— OFMatrix4x4: SSE1 for -[transformVectors:count:]
This new SSE1 implementation is better than the SSE4.1 implementation,
hence this also deletes the SSE4.1 implementation. (user: js, size: 9770) [annotate] [blame] [check-ins using]
︙ | ︙ | |||
28 29 30 31 32 33 34 | { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options | | | > > | > | | | > > > > > | | > > > > > | | > > > > | > | | | > | > > > | | 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options # pragma GCC target("sse") # endif static void transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { OF_ALIGN(16) float tmp[4]; __asm__ __volatile__ ( "test %0, %0\n\t" "jz 0f\n" "\n\t" "movaps (%2), %%xmm0\n\t" "movaps 16(%2), %%xmm1\n\t" "movaps 32(%2), %%xmm2\n\t" "movaps (%1), %%xmm3\n" "\n\t" "0:\n\t" "movaps %%xmm0, %%xmm4\n\t" "mulps %%xmm3, %%xmm4\n\t" "movaps %%xmm4, (%3)\n\t" "addss 4(%3), %%xmm4\n\t" "addss 8(%3), %%xmm4\n\t" "addss 12(%3), %%xmm4\n" "\n\t" "movaps %%xmm1, %%xmm5\n\t" "mulps %%xmm3, %%xmm5\n\t" "movaps %%xmm5, (%3)\n\t" "addss 4(%3), %%xmm5\n\t" "addss 8(%3), %%xmm5\n\t" "addss 12(%3), %%xmm5\n" "\n\t" "movaps %%xmm2, %%xmm6\n\t" "mulps %%xmm3, %%xmm6\n\t" "movaps %%xmm6, (%3)\n\t" "addss 4(%3), %%xmm6\n\t" "addss 8(%3), %%xmm6\n\t" "addss 12(%3), %%xmm6\n" "\n\t" "movaps 48(%2), %%xmm7\n\t" "mulps %%xmm3, %%xmm7\n\t" "movaps %%xmm7, (%3)\n\t" "addss 4(%3), %%xmm7\n\t" "addss 8(%3), %%xmm7\n\t" "addss 12(%3), %%xmm7\n" "\n\t" "movss %%xmm4, (%1)\n\t" "movss %%xmm5, 4(%1)\n\t" "movss %%xmm6, 8(%1)\n\t" "movss %%xmm7, 12(%1)\n" "\n\t" "add $16, %1\n\t" "dec %0\n\t" "jnz 0b\n" : "+r"(count), "+r"(vectors) : "r"(self->_values), "r"(&tmp) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif |
︙ | ︙ | |||
205 206 207 208 209 210 211 | return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); | | | | 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); if ([OFSystemInfo supportsSSE]) { REPLACE(@selector(transformVectors:count:), transformVectors_SSE) } else if ([OFSystemInfo supports3DNow]) { REPLACE(@selector(multiplyWithMatrix:), multiplyWithMatrix_3DNow) REPLACE(@selector(transformVectors:count:), transformVectors_3DNow) } |
︙ | ︙ |