28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
-
+
-
+
+
+
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
|
{ 0, 0, 0, 1 }
};
@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
# pragma GCC push_options
# pragma GCC target("sse4.1")
# pragma GCC target("sse")
# endif
/*
 * In-place transform of `count` 4-component vectors by the 4x4 matrix stored
 * in self->_values, written as x86 inline assembly.  Each output component is
 * the dot product of one 16-byte matrix row (offsets 0, 16, 32, 48 from
 * self->_values) with the input vector; the four results overwrite the vector
 * they were computed from, then the pointer advances by 16 bytes.
 *
 * NOTE(review): this span appears to interleave two revisions of the same
 * function (it reads like a diff whose +/- markers were lost).  It carries two
 * signature lines, two input-operand lists and two loop bodies -- one built on
 * dpps/insertps, one built on mulps/addss with the `tmp` scratch buffer -- and
 * will not compile as written.  Confirm which revision is intended before
 * changing any code here.
 *
 * NOTE(review): every memory access uses movaps, which faults on addresses
 * that are not 16-byte aligned; `vectors` and self->_values are presumably
 * declared with that alignment elsewhere -- verify.
 */
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
/* NOTE(review): second signature line; only one of the two names can stand. */
transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
size_t count)
{
/* 16-byte aligned scratch space; passed to the asm as operand %3. */
OF_ALIGN(16) float tmp[4];
/* Operands: %0 = count, %1 = vectors, %2 = self->_values, %3 = &tmp. */
__asm__ __volatile__ (
/*
 * Zero-count check.
 * NOTE(review): the only `0:` label visible in this statement is the loop
 * head, so `jz 0f` would enter the loop body with count == 0 instead of
 * skipping it, and `dec` would then wrap -- verify whether an exit label is
 * missing.
 */
"test %0, %0\n\t"
"jz 0f\n"
"\n\t"
/* Load matrix rows into xmm0..xmm2 once, ahead of the loop. */
"movaps (%2), %%xmm0\n\t"
"movaps 16(%2), %%xmm1\n\t"
"movaps 32(%2), %%xmm2\n\t"
/*
 * The next two lines both write xmm3: the first loads matrix row 3, the
 * second loads the current vector.  They look like the old/new pair of a
 * diff.  NOTE(review): the vector load sits before the loop label, so it
 * would not be repeated per iteration -- verify against the real file.
 */
"movaps 48(%2), %%xmm3\n"
"movaps (%1), %%xmm3\n"
"\n\t"
/* Loop head: one iteration per vector. */
"0:\n\t"
"movaps %%xmm0, %%xmm4\n\t"
/*
 * dpps-based body: copy the vector into xmm4..xmm7, take the full
 * four-lane dot product of each copy with rows xmm0..xmm3, gather lane 0 of
 * xmm5/xmm6/xmm7 into lanes 1/2/3 of xmm4 with insertps, and store the
 * packed result back over the vector.
 */
"movaps (%1), %%xmm4\n\t"
"movaps %%xmm4, %%xmm5\n\t"
"dpps $0xFF, %%xmm0, %%xmm4\n\t"
"movaps %%xmm5, %%xmm6\n\t"
"dpps $0xFF, %%xmm1, %%xmm5\n\t"
"movaps %%xmm6, %%xmm7\n\t"
"dpps $0xFF, %%xmm2, %%xmm6\n\t"
"dpps $0xFF, %%xmm3, %%xmm7\n\t"
"insertps $0x10, %%xmm5, %%xmm4\n\t"
"insertps $0x20, %%xmm6, %%xmm4\n\t"
"insertps $0x30, %%xmm7, %%xmm4\n\t"
"movaps %%xmm4, (%1)\n"
/*
 * mulps/addss-based body: for each row, multiply lane-wise by the vector
 * held in xmm3, spill the four products to tmp, then add tmp[1..3] onto
 * lane 0 so that lane 0 holds the dot product.
 */
/* Row 0 (xmm4 was seeded from xmm0 above). */
"mulps %%xmm3, %%xmm4\n\t"
"movaps %%xmm4, (%3)\n\t"
"addss 4(%3), %%xmm4\n\t"
"addss 8(%3), %%xmm4\n\t"
"addss 12(%3), %%xmm4\n"
"\n\t"
/* Row 1. */
"movaps %%xmm1, %%xmm5\n\t"
"mulps %%xmm3, %%xmm5\n\t"
"movaps %%xmm5, (%3)\n\t"
"addss 4(%3), %%xmm5\n\t"
"addss 8(%3), %%xmm5\n\t"
"addss 12(%3), %%xmm5\n"
"\n\t"
/* Row 2. */
"movaps %%xmm2, %%xmm6\n\t"
"mulps %%xmm3, %%xmm6\n\t"
"movaps %%xmm6, (%3)\n\t"
"addss 4(%3), %%xmm6\n\t"
"addss 8(%3), %%xmm6\n\t"
"addss 12(%3), %%xmm6\n"
"\n\t"
/* Row 3 is re-read from memory each iteration; xmm3 is holding the vector. */
"movaps 48(%2), %%xmm7\n\t"
"mulps %%xmm3, %%xmm7\n\t"
"movaps %%xmm7, (%3)\n\t"
"addss 4(%3), %%xmm7\n\t"
"addss 8(%3), %%xmm7\n\t"
"addss 12(%3), %%xmm7\n"
"\n\t"
/* Write lane 0 of each accumulator back as the four output components. */
"movss %%xmm4, (%1)\n\t"
"movss %%xmm5, 4(%1)\n\t"
"movss %%xmm6, 8(%1)\n\t"
"movss %%xmm7, 12(%1)\n"
"\n\t"
/* Advance to the next 16-byte vector and loop until count reaches zero. */
"add $16, %1\n\t"
"dec %0\n\t"
"jnz 0b\n"
/* count and vectors are read-write because the asm modifies both. */
: "+r"(count), "+r"(vectors)
/*
 * NOTE(review): two input-operand lists follow; the second adds &tmp as %3,
 * which only the mulps/addss body references.  Only one list can stand.
 */
: "r"(self->_values)
: "r"(self->_values), "r"(&tmp)
/* All eight xmm registers are used; "memory" covers the stores above. */
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"memory"
);
}
# ifndef __clang__
# pragma GCC pop_options
# endif
|
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
|
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
|
-
+
-
+
|
return;
# define REPLACE(selector, func) \
typeEncoding = method_getTypeEncoding( \
class_getInstanceMethod(self, selector)); \
class_replaceMethod(self, selector, (IMP)func, typeEncoding);
if ([OFSystemInfo supportsSSE41]) {
if ([OFSystemInfo supportsSSE]) {
REPLACE(@selector(transformVectors:count:),
transformVectors_SSE41)
transformVectors_SSE)
} else if ([OFSystemInfo supports3DNow]) {
REPLACE(@selector(multiplyWithMatrix:),
multiplyWithMatrix_3DNow)
REPLACE(@selector(transformVectors:count:),
transformVectors_3DNow)
}
|