Overview
Comment: OFMatrix4x4: Use an extra SSE register on AMD64.
This gives another nice speed improvement.
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: 5edf0d083d8cbfc4777d4814f0199b24 (NOTE: value appears truncated — a SHA3-256 digest is 64 hex digits; confirm against the repository)
User & Date: js on 2023-11-06 20:17:44
Other Links: | manifest | tags |
Context
2023-11-09
| ||
21:09 | Use named operands for __asm__ check-in: 1b22456db6 user: js tags: trunk | |
2023-11-06
| ||
20:17 | OFMatrix4x4: Use an extra SSE register on AMD64 check-in: 5edf0d083d user: js tags: trunk | |
20:11 | OFMatrix4x4: Fix missing vector reload in SSE check-in: 9ba7594f7b user: js tags: trunk | |
Changes
Modified src/OFMatrix4x4.m from [b53637018a] to [a4f9a5f812].
︙ | |||
42 43 44 45 46 47 48 | 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | - + + + + | __asm__ __volatile__ ( "test %0, %0\n\t" "jz 0f\n" "\n\t" "movaps (%2), %%xmm0\n\t" "movaps 16(%2), %%xmm1\n\t" |
︙ | |||
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | + + + + + + + | "movaps %%xmm2, %%xmm6\n\t" "mulps %%xmm3, %%xmm6\n\t" "movaps %%xmm6, (%3)\n\t" "addss 4(%3), %%xmm6\n\t" "addss 8(%3), %%xmm6\n\t" "addss 12(%3), %%xmm6\n" "\n\t" # ifdef OF_AMD64 "movaps %%xmm8, %%xmm7\n\t" # else "movaps 48(%2), %%xmm7\n\t" # endif "mulps %%xmm3, %%xmm7\n\t" "movaps %%xmm7, (%3)\n\t" "addss 4(%3), %%xmm7\n\t" "addss 8(%3), %%xmm7\n\t" "addss 12(%3), %%xmm7\n" "\n\t" "movss %%xmm4, (%1)\n\t" "movss %%xmm5, 4(%1)\n\t" "movss %%xmm6, 8(%1)\n\t" "movss %%xmm7, 12(%1)\n" "\n\t" "add $16, %1\n\t" "dec %0\n\t" "jnz 0b\n" : "+r"(count), "+r"(vectors) : "r"(self->_values), "r"(&tmp) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", # ifdef OF_AMD64 "xmm8", # endif "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif |
︙ |