37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
-
+
-
-
-
+
+
+
-
+
-
+
-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
+
-
+
-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
+
-
-
+
+
+
-
-
+
+
+
|
static void
transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	/*
	 * Multiply `count' 4D vectors in `vectors' in place by the 4x4
	 * matrix in self->_values, using SSE.
	 *
	 * The matrix rows are cached in xmm0-xmm2 across iterations.  On
	 * AMD64 the fourth row is additionally cached in xmm8; on IA-32 only
	 * xmm0-xmm7 exist, so the fourth row is reloaded every iteration.
	 * `tmp' is a 16-byte-aligned spill slot (movaps requires alignment)
	 * used to sum the four per-row products horizontally via addss.
	 */
	OF_ALIGN(16) float tmp[4];

	__asm__ __volatile__ (
	    /*
	     * Skip everything for count == 0: falling into the loop would
	     * make `dec' wrap count around to SIZE_MAX.  Note the exit label
	     * must be distinct from the loop label - `jz 0f' would land on
	     * the loop head.
	     */
	    "test	%[count], %[count]\n\t"
	    "jz	1f\n"
	    "\n\t"
	    "movaps	(%[matrix]), %%xmm0\n\t"
	    "movaps	16(%[matrix]), %%xmm1\n\t"
	    "movaps	32(%[matrix]), %%xmm2\n\t"
# ifdef OF_AMD64
	    "movaps	48(%[matrix]), %%xmm8\n"
# endif
	    "\n\t"
	    "0:\n\t"
	    "movaps	(%[vectors]), %%xmm3\n"
	    "\n\t"
	    /* Row 0 dot vector -> xmm4 (low scalar). */
	    "movaps	%%xmm0, %%xmm4\n\t"
	    "mulps	%%xmm3, %%xmm4\n\t"
	    "movaps	%%xmm4, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm4\n\t"
	    "addss	8(%[tmp]), %%xmm4\n\t"
	    "addss	12(%[tmp]), %%xmm4\n"
	    "\n\t"
	    /* Row 1 dot vector -> xmm5. */
	    "movaps	%%xmm1, %%xmm5\n\t"
	    "mulps	%%xmm3, %%xmm5\n\t"
	    "movaps	%%xmm5, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm5\n\t"
	    "addss	8(%[tmp]), %%xmm5\n\t"
	    "addss	12(%[tmp]), %%xmm5\n"
	    "\n\t"
	    /* Row 2 dot vector -> xmm6. */
	    "movaps	%%xmm2, %%xmm6\n\t"
	    "mulps	%%xmm3, %%xmm6\n\t"
	    "movaps	%%xmm6, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm6\n\t"
	    "addss	8(%[tmp]), %%xmm6\n\t"
	    "addss	12(%[tmp]), %%xmm6\n"
	    "\n\t"
	    /* Row 3 dot vector -> xmm7. */
# ifdef OF_AMD64
	    "movaps	%%xmm8, %%xmm7\n\t"
# else
	    "movaps	48(%[matrix]), %%xmm7\n\t"
# endif
	    "mulps	%%xmm3, %%xmm7\n\t"
	    "movaps	%%xmm7, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm7\n\t"
	    "addss	8(%[tmp]), %%xmm7\n\t"
	    "addss	12(%[tmp]), %%xmm7\n"
	    "\n\t"
	    /* Store the transformed vector back in place. */
	    "movss	%%xmm4, (%[vectors])\n\t"
	    "movss	%%xmm5, 4(%[vectors])\n\t"
	    "movss	%%xmm6, 8(%[vectors])\n\t"
	    "movss	%%xmm7, 12(%[vectors])\n"
	    "\n\t"
	    "add	$16, %[vectors]\n\t"
	    "dec	%[count]\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "1:\n"
	    : [count] "+r" (count),
	      [vectors] "+r" (vectors)
	    : [matrix] "r" (self->_values),
	      [tmp] "r" (&tmp)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
# ifdef OF_AMD64
	      "xmm8",
# endif
	      "memory"
	);
}
|
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
|
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
|
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
+
-
-
+
+
-
-
+
+
+
+
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
-
+
+
-
-
+
+
-
+
-
-
+
+
+
-
-
+
+
|
/*
 * Tail of the 3DNow! 4x4 matrix multiply (self = matrix * self).  The
 * result is accumulated into a local array and copied back at the end so
 * that `right' (self->_values) can be read unchanged throughout.  Each
 * result element is a column-dot-row computed on 2-float MMX halves;
 * pfacc performs the horizontal adds.  The loop walks the 4 rows of
 * `left' / `result' (16 bytes per row); `right' is re-read column-wise
 * via fixed offsets each iteration.
 */
	float (*left)[4] = matrix->_values, (*right)[4] = self->_values;
	float result[4][4], (*resultPtr)[4] = result;

	__asm__ __volatile__ (
	    "movl	$4, %%ecx\n\t"
	    "\n\t"
	    "0:\n\t"
	    /* result[row][0] */
	    "movd	(%[right]), %%mm0\n\t"
	    "punpckldq	16(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	32(%[right]), %%mm1\n\t"
	    "punpckldq	48(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, (%[result])\n\t"
	    /* result[row][1] */
	    "movd	4(%[right]), %%mm0\n\t"
	    "punpckldq	20(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	36(%[right]), %%mm1\n\t"
	    "punpckldq	52(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 4(%[result])\n\t"
	    /* result[row][2] */
	    "movd	8(%[right]), %%mm0\n\t"
	    "punpckldq	24(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	40(%[right]), %%mm1\n\t"
	    "punpckldq	56(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 8(%[result])\n\t"
	    /* result[row][3] */
	    "movd	12(%[right]), %%mm0\n\t"
	    "punpckldq	28(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	44(%[right]), %%mm1\n\t"
	    "punpckldq	60(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 12(%[result])\n"
	    "\n\t"
	    "add	$16, %[result]\n\t"
	    "add	$16, %[left]\n\t"
	    "decl	%%ecx\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    /* Reset the FPU/MMX state before returning to FPU code. */
	    "femms"
	    : [result] "+r" (resultPtr),
	      [left] "+r" (left),
	      [right] "+r" (right)
	    :
	    : "ecx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, 16 * sizeof(float));
}
static void
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	/*
	 * Multiply `count' 4D vectors in `vectors' in place by the 4x4
	 * matrix in self->_values, using 3DNow!.  Each matrix row is
	 * processed as two 2-float MMX halves; pfacc performs the
	 * horizontal adds.
	 */
	__asm__ __volatile__ (
	    /*
	     * Skip the loop for count == 0: falling into it would make
	     * `dec' wrap count around to SIZE_MAX.  The exit label must be
	     * distinct from the loop label - `jz 0f' would resolve to the
	     * loop head, the nearest forward `0:', instead of the intended
	     * femms epilogue.
	     */
	    "test	%[count], %[count]\n\t"
	    "jz	1f\n"
	    "\n\t"
	    "0:\n\t"
	    "movq	(%[vectors]), %%mm0\n\t"
	    "movq	8(%[vectors]), %%mm1\n"
	    "\n\t"
	    /* Row 0 dot vector -> mm2 (low scalar). */
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	(%[matrix]), %%mm2\n\t"
	    "pfmul	8(%[matrix]), %%mm3\n\t"
	    "pfacc	%%mm3, %%mm2\n\t"
	    "pfacc	%%mm2, %%mm2\n\t"
	    "\n\t"
	    /* Row 1 dot vector -> mm3. */
	    "movq	%%mm0, %%mm3\n\t"
	    "movq	%%mm1, %%mm4\n\t"
	    "pfmul	16(%[matrix]), %%mm3\n\t"
	    "pfmul	24(%[matrix]), %%mm4\n\t"
	    "pfacc	%%mm4, %%mm3\n\t"
	    "pfacc	%%mm3, %%mm3\n\t"
	    "\n\t"
	    /* Store the first two result components. */
	    "punpckldq	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, (%[vectors])\n"
	    "\n\t"
	    /* Row 2 dot vector -> mm2. */
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	32(%[matrix]), %%mm2\n\t"
	    "pfmul	40(%[matrix]), %%mm3\n\t"
	    "pfacc	%%mm3, %%mm2\n\t"
	    "pfacc	%%mm2, %%mm2\n\t"
	    "\n\t"
	    /* Row 3: mm0/mm1 hold the input vector, no longer needed. */
	    "pfmul	48(%[matrix]), %%mm0\n\t"
	    "pfmul	56(%[matrix]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "\n\t"
	    /* Store the last two result components. */
	    "punpckldq	%%mm0, %%mm2\n\t"
	    "movq	%%mm2, 8(%[vectors])\n"
	    "\n\t"
	    "add	$16, %[vectors]\n\t"
	    "dec	%[count]\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "1:\n\t"
	    /* Reset the FPU/MMX state before returning to FPU code. */
	    "femms"
	    : [count] "+r" (count),
	      [vectors] "+r" (vectors)
	    : [matrix] "r" (self->_values)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);
}
# ifndef __clang__
# pragma GCC pop_options
# endif
|