Differences From Artifact [43a6889a80]:
- File src/OFMatrix4x4.m — part of check-in [ad6b3e6442] at 2023-11-01 20:54:36 on branch trunk — Add -[OFMatrix4x4 transformVectors:count:] (user: js, size: 9504) [annotate] [blame] [check-ins using]
To Artifact [b42a0641ea]:
- File src/OFMatrix4x4.m — part of check-in [f949f7775b] at 2023-11-01 21:22:10 on branch trunk — OFMatrix4x4: Use __asm__ __volatile__
The outputs aren't consumed as they are used as inputs that can be
modified. (user: js, size: 9582) [annotate] [blame] [check-ins using]
︙ | ︙ | |||
38 39 40 41 42 43 44 | multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float result[4][4]; for (uint_fast8_t i = 0; i < 4; i++) { for (uint_fast8_t j = 0; j < 4; j++) { | | | | | | | 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float result[4][4]; for (uint_fast8_t i = 0; i < 4; i++) { for (uint_fast8_t j = 0; j < 4; j++) { __asm__ __volatile__ ( "movd (%2), %%mm0\n\t" "punpckldq 16(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 32(%2), %%mm1\n\t" "punpckldq 48(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfadd %%mm1, %%mm0\n\t" "pswapd %%mm0, %%mm1\n\t" "pfadd %%mm1, %%mm0\n\t" "movd %%mm0, %0" :: "m"(result[i][j]), "r"(&matrix->_values[i][0]), "r"(&self->_values[0][j]) : "mm0", "mm1", "memory" ); } } __asm__ __volatile__ ("femms"); memcpy(self->_values, result, sizeof(result)); } static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float result[4][4]; for (uint_fast8_t i = 0; i < 4; i++) { for (uint_fast8_t j = 0; j < 4; j++) { __asm__ __volatile__ ( "movd (%2), %%mm0\n\t" "punpckldq 16(%2), %%mm0\n\t" "pfmul (%1), %%mm0\n\t" "movd 32(%2), %%mm1\n\t" "punpckldq 48(%2), %%mm1\n\t" "pfmul 8(%1), %%mm1\n\t" "pfadd %%mm1, %%mm0\n\t" "movq %%mm0, %%mm1\n\t" "psrlq $32, %%mm1\n\t" "pfadd %%mm1, %%mm0\n\t" "movd %%mm0, %0" :: "m"(result[i][j]), "r"(&matrix->_values[i][0]), "r"(&self->_values[0][j]) : "mm0", "mm1", "memory" ); } } __asm__ __volatile__ ("femms"); memcpy(self->_values, result, sizeof(result)); } static void transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { __asm__ __volatile__ ( "0:\n\t" "test %0, %0\n\t" "jz 0f\n" "\n\t" "movq (%1), %%mm0\n\t" "movq 8(%1), %%mm1\n" "\n\t" |
︙ | ︙ | |||
155 156 157 158 159 160 161 | ); } static void transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { | | | 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | ); } static void transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { __asm__ __volatile__ ( "0:\n\t" "test %0, %0\n\t" "jz 0f\n" "\n\t" "movq (%1), %%mm0\n\t" "movq 8(%1), %%mm1\n" "\n\t" |
︙ | ︙ |