Differences From Artifact [b53637018a]:
- File src/OFMatrix4x4.m — part of check-in [9ba7594f7b] at 2023-11-06 20:11:51 on branch trunk — OFMatrix4x4: Fix missing vector reload in SSE (user: js, size: 9780) [annotate] [blame] [check-ins using]
To Artifact [a4f9a5f812]:
- File src/OFMatrix4x4.m — part of check-in [5edf0d083d] at 2023-11-06 20:17:44 on branch trunk — OFMatrix4x4: Use an extra SSE register on AMD64. This gives another nice speed improvement. (user: js, size: 9943) [annotate] [blame] [check-ins using]
︙ | ︙ | |||
42 43 44 45 46 47 48 | __asm__ __volatile__ ( "test %0, %0\n\t" "jz 0f\n" "\n\t" "movaps (%2), %%xmm0\n\t" "movaps 16(%2), %%xmm1\n\t" | | > > > | 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | __asm__ __volatile__ ( "test %0, %0\n\t" "jz 0f\n" "\n\t" "movaps (%2), %%xmm0\n\t" "movaps 16(%2), %%xmm1\n\t" "movaps 32(%2), %%xmm2\n\t" # ifdef OF_AMD64 "movaps 48(%2), %%xmm8\n" # endif "\n\t" "0:\n\t" "movaps (%1), %%xmm3\n" "\n\t" "movaps %%xmm0, %%xmm4\n\t" "mulps %%xmm3, %%xmm4\n\t" "movaps %%xmm4, (%3)\n\t" |
︙ | ︙ | |||
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | "movaps %%xmm2, %%xmm6\n\t" "mulps %%xmm3, %%xmm6\n\t" "movaps %%xmm6, (%3)\n\t" "addss 4(%3), %%xmm6\n\t" "addss 8(%3), %%xmm6\n\t" "addss 12(%3), %%xmm6\n" "\n\t" "movaps 48(%2), %%xmm7\n\t" "mulps %%xmm3, %%xmm7\n\t" "movaps %%xmm7, (%3)\n\t" "addss 4(%3), %%xmm7\n\t" "addss 8(%3), %%xmm7\n\t" "addss 12(%3), %%xmm7\n" "\n\t" "movss %%xmm4, (%1)\n\t" "movss %%xmm5, 4(%1)\n\t" "movss %%xmm6, 8(%1)\n\t" "movss %%xmm7, 12(%1)\n" "\n\t" "add $16, %1\n\t" "dec %0\n\t" "jnz 0b\n" : "+r"(count), "+r"(vectors) : "r"(self->_values), "r"(&tmp) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif | > > > > > > > | 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | "movaps %%xmm2, %%xmm6\n\t" "mulps %%xmm3, %%xmm6\n\t" "movaps %%xmm6, (%3)\n\t" "addss 4(%3), %%xmm6\n\t" "addss 8(%3), %%xmm6\n\t" "addss 12(%3), %%xmm6\n" "\n\t" # ifdef OF_AMD64 "movaps %%xmm8, %%xmm7\n\t" # else "movaps 48(%2), %%xmm7\n\t" # endif "mulps %%xmm3, %%xmm7\n\t" "movaps %%xmm7, (%3)\n\t" "addss 4(%3), %%xmm7\n\t" "addss 8(%3), %%xmm7\n\t" "addss 12(%3), %%xmm7\n" "\n\t" "movss %%xmm4, (%1)\n\t" "movss %%xmm5, 4(%1)\n\t" "movss %%xmm6, 8(%1)\n\t" "movss %%xmm7, 12(%1)\n" "\n\t" "add $16, %1\n\t" "dec %0\n\t" "jnz 0b\n" : "+r"(count), "+r"(vectors) : "r"(self->_values), "r"(&tmp) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", # ifdef OF_AMD64 "xmm8", # endif "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif |
︙ | ︙ |