Overview
Comment: | OFMatrix4x4: Unroll inner loop in 3DNow! version
This results in a ~16% performance improvement on a Duron 750. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: | 3280466d358455183daebacdccb7be91 |
User & Date: | js on 2023-11-05 18:18:05 |
Other Links: | manifest | tags |
Context
2023-11-06
| ||
00:59 | OFMatrix4x4: SSE1 for -[transformVectors:count:] check-in: cf955413ab user: js tags: trunk | |
2023-11-05
| ||
18:18 | OFMatrix4x4: Unroll inner loop in 3DNow! version check-in: 3280466d35 user: js tags: trunk | |
13:37 | OFMatrix4x4: Improve 3DNow! implementation check-in: 34b48a4208 user: js tags: trunk | |
Changes
Modified src/OFMatrix4x4.m from [66a4a58f6e] to [b31d5e9464].
︙ | ︙ | |||
81 82 83 84 85 86 87 | static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float (*left)[4] = matrix->_values, (*right)[4] = self->_values; float result[4][4], (*resultPtr)[4] = result; __asm__ __volatile__ ( | | | | > | > > > > > | | > > > > > > > | > > > > > > > > | | | < | < | | | 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float (*left)[4] = matrix->_values, (*right)[4] = self->_values; float result[4][4], (*resultPtr)[4] = result; __asm__ __volatile__ ( "movl $4, %%ecx\n\t" "\n\t" "0:\n\t" "movd (%2), %%mm0\n\t" "punpckldq 16(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 32(%2), %%mm1\n\t" "punpckldq 48(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, (%0)\n\t" "movd 4(%2), %%mm0\n\t" "punpckldq 20(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 36(%2), %%mm1\n\t" "punpckldq 52(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 4(%0)\n\t" "movd 8(%2), %%mm0\n\t" "punpckldq 24(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 40(%2), %%mm1\n\t" "punpckldq 56(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 8(%0)\n\t" "movd 12(%2), %%mm0\n\t" "punpckldq 28(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 44(%2), %%mm1\n\t" "punpckldq 60(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 12(%0)\n" "\n\t" "add $16, %0\n\t" "add $16, %1\n\t" "decl %%ecx\n\t" "jnz 0b\n" "\n\t" "femms" : "+r"(resultPtr), "+r"(left), "+r"(right) :: "ecx", "mm0", "mm1", "memory" ); memcpy(self->_values, result, 16 * sizeof(float)); } static void 
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, |
︙ | ︙ |