Differences From Artifact [66a4a58f6e]:
- File
src/OFMatrix4x4.m
— part of check-in
[34b48a4208]
at
2023-11-05 13:37:08
on branch trunk
— OFMatrix4x4: Improve 3DNow! implementation
The new 3DNow! implementation is now better than the Enhanced 3DNow!
implementation, hence this also deletes the Enhanced 3DNow!
implementation. (user: js, size: 8587) [annotate] [blame] [check-ins using]
To Artifact [b31d5e9464]:
- File
src/OFMatrix4x4.m
— part of check-in
[3280466d35]
at
2023-11-05 18:18:05
on branch trunk
— OFMatrix4x4: Unroll inner loop in 3DNow! version
This results in a ~ 16% performance improvement on a Duron 750. (user: js, size: 9256) [annotate] [blame] [check-ins using]
︙ | ︙ | |||
/*
 * multiplyWithMatrix_3DNow -- 4x4 float matrix product using AMD 3DNow!
 * instructions; the product is stored back into self->_values.
 *
 * Operands:
 *   %0 = resultPtr, walks the rows of the local `result` buffer
 *   %1 = left      = matrix->_values, advanced by one row per iteration
 *   %2 = right     = self->_values, never advanced
 *
 * In terms of the [4][4] array indices, the code computes
 *
 *   result[i][j] = left[i][0] * right[0][j] + left[i][1] * right[1][j]
 *                + left[i][2] * right[2][j] + left[i][3] * right[3][j]
 *
 * The outer loop runs four times (ecx counts down from 4, decl/jnz) and
 * produces one row of `result` per iteration; the four columns j = 0..3
 * are written out by hand instead of using an inner loop. For column j:
 *
 *   movd      4*j(%2)     / punpckldq 4*j+16(%2)  -> mm0 = { right[0][j], right[1][j] }
 *   pfmul     (%1)                                -> mm0 *= { left[i][0], left[i][1] }
 *   movd      4*j+32(%2)  / punpckldq 4*j+48(%2)  -> mm1 = { right[2][j], right[3][j] }
 *   pfmul     8(%1)                               -> mm1 *= { left[i][2], left[i][3] }
 *   pfacc     mm1, mm0                            -> mm0 = { mm0.lo + mm0.hi, mm1.lo + mm1.hi }
 *   pfacc     mm0, mm0                            -> low dword = sum of all four products
 *   movd      mm0, 4*j(%0)                        -> store result[i][j]
 *
 * After each row, %0 and %1 are advanced by 16 bytes (four floats). All
 * three pointers are "+r" (read-write) operands, so the asm only changes
 * the local pointer copies. femms leaves MMX state at the end so that x87
 * floating point code can run afterwards.
 *
 * The product is built in the temporary `result` and then copied with
 * memcpy because `right` points at self->_values, which is still being
 * read until the last iteration; writing the output in place would
 * overwrite inputs that are still needed.
 *
 * NOTE(review): add/decl modify EFLAGS, but "cc" is not in the clobber
 * list. GCC on x86 is generally understood to treat the flags as clobbered
 * by every asm statement -- confirm for the compilers in use.
 *
 * NOTE(review): the 8-byte loads in pfmul and the fixed 16-byte row stride
 * assume _values is a contiguous float[4][4] -- this matches the local
 * pointer declarations, but confirm against the class declaration.
 */
81 82 83 84 85 86 87 | static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float (*left)[4] = matrix->_values, (*right)[4] = self->_values; float result[4][4], (*resultPtr)[4] = result; __asm__ __volatile__ ( | | | | > | > > > > > | | > > > > > > > | > > > > > > > > | | | < | < | | | 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float (*left)[4] = matrix->_values, (*right)[4] = self->_values; float result[4][4], (*resultPtr)[4] = result; __asm__ __volatile__ ( "movl $4, %%ecx\n\t" "\n\t" "0:\n\t" "movd (%2), %%mm0\n\t" "punpckldq 16(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 32(%2), %%mm1\n\t" "punpckldq 48(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, (%0)\n\t" "movd 4(%2), %%mm0\n\t" "punpckldq 20(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 36(%2), %%mm1\n\t" "punpckldq 52(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 4(%0)\n\t" "movd 8(%2), %%mm0\n\t" "punpckldq 24(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 40(%2), %%mm1\n\t" "punpckldq 56(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 8(%0)\n\t" "movd 12(%2), %%mm0\n\t" "punpckldq 28(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 44(%2), %%mm1\n\t" "punpckldq 60(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 12(%0)\n" "\n\t" "add $16, %0\n\t" "add $16, %1\n\t" "decl %%ecx\n\t" "jnz 0b\n" "\n\t" "femms" : "+r"(resultPtr), "+r"(left), "+r"(right) :: "ecx", "mm0", "mm1", "memory" ); memcpy(self->_values, result, 16 * sizeof(float)); } static void 
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, |
︙ | ︙ |