25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
-
+
-
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
+
+
|
{ 1, 0, 0, 0 },
{ 0, 1, 0, 0 },
{ 0, 0, 1, 0 },
{ 0, 0, 0, 1 }
};
@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(HAVE_INTEL_SYNTAX)
#if defined(OF_AMD64) || defined(OF_X86)
/*
 * 3DNow! implementation of matrix multiplication.
 *
 * Computes, for every i and j,
 *
 *   result[i][j] = sum over k of matrix->_values[i][k] * self->_values[k][j]
 *
 * i.e. the product `matrix * self`, and then overwrites self->_values with
 * that product. The product is built in a temporary so that self->_values can
 * be read for all 16 elements before any of them is replaced.
 *
 * The byte offsets 16/32/48 used below step from one row to the next within
 * a column, so this assumes _values is laid out as a contiguous float[4][4]
 * (consistent with the float[4][4] temporary that is memcpy'd over it).
 *
 * _cmd is unused; it is only present so that the function has the signature
 * of a method implementation.
 *
 * The assembly uses AT&T operand order (source first, destination last).
 */
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	/* Every element is written by the loop below before it is read. */
	float result[4][4];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			/*
			 * %0: destination element result[i][j]
			 * %1: address of row i of matrix
			 * %2: address of the first element of column j of self
			 *
			 * result[i][j] is declared as an output operand since
			 * the final movd stores to it. The asm is volatile so
			 * that it is kept in order relative to the femms
			 * below. The "memory" clobber covers the loads done
			 * through the pointers in %1 and %2.
			 */
			__asm__ __volatile__ (
			    /* mm0 = { self[0][j], self[1][j] } */
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    /* mm0 *= { matrix[i][0], matrix[i][1] } */
			    "pfmul	(%1), %%mm0\n\t"
			    /* mm1 = { self[2][j], self[3][j] } */
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    /* mm1 *= { matrix[i][2], matrix[i][3] } */
			    "pfmul	8(%1), %%mm1\n\t"
			    /* Add the two pairs of partial products. */
			    "pfadd	%%mm1, %%mm0\n\t"
			    /* Horizontal add: low half += high half. */
			    "movq	%%mm0, %%mm1\n\t"
			    "psrlq	$32, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    /* Store the low 32 bits, the dot product. */
			    "movd	%%mm0, %0"
			    : "=m"(result[i][j])
			    : "r"(&matrix->_values[i][0]),
			      "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
		}
	}

	/* Leave MMX state so that x87 floating point can be used again. */
	__asm__ __volatile__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}
static OFVector4D
transformedVector_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D vector)
{
OFVector4D result;
__asm__ (
"movq mm0, [%2]\n\t"
"movq mm1, [%2 + 8]\n"
"movq (%2), %%mm0\n\t"
"movq 8(%2), %%mm1\n"
"\n\t"
"movq mm2, mm0\n\t"
"movq mm3, mm1\n\t"
"pfmul mm2, [%1]\n\t"
"pfmul mm3, [%1 + 8]\n\t"
"pfadd mm2, mm3\n\t"
"movq mm3, mm2\n\t"
"psrlq mm3, 32\n\t"
"pfadd mm2, mm3\n"
"movq %%mm0, %%mm2\n\t"
"movq %%mm1, %%mm3\n\t"
"pfmul (%1), %%mm2\n\t"
"pfmul 8(%1), %%mm3\n\t"
"pfadd %%mm3, %%mm2\n\t"
"movq %%mm2, %%mm3\n\t"
"psrlq $32, %%mm3\n\t"
"pfadd %%mm3, %%mm2\n"
"\n\t"
"movq mm3, mm0\n\t"
"movq mm4, mm1\n\t"
"pfmul mm3, [%1 + 16]\n\t"
"pfmul mm4, [%1 + 24]\n\t"
"pfadd mm3, mm4\n\t"
"movq mm4, mm3\n\t"
"psrlq mm4, 32\n\t"
"pfadd mm3, mm4\n"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"pfmul 16(%1), %%mm3\n\t"
"pfmul 24(%1), %%mm4\n\t"
"pfadd %%mm4, %%mm3\n\t"
"movq %%mm3, %%mm4\n\t"
"psrlq $32, %%mm4\n\t"
"pfadd %%mm4, %%mm3\n"
"\n\t"
"punpckldq mm2, mm3\n\t"
"movq [%0], mm2\n"
"punpckldq %%mm3, %%mm2\n\t"
"movq %%mm2, (%0)\n"
"\n\t"
"movq mm2, mm0\n\t"
"movq mm3, mm1\n\t"
"pfmul mm2, [%1 + 32]\n\t"
"pfmul mm3, [%1 + 40]\n\t"
"pfadd mm2, mm3\n\t"
"movq mm3, mm2\n\t"
"psrlq mm3, 32\n\t"
"pfadd mm2, mm3\n"
"movq %%mm0, %%mm2\n\t"
"movq %%mm1, %%mm3\n\t"
"pfmul 32(%1), %%mm2\n\t"
"pfmul 40(%1), %%mm3\n\t"
"pfadd %%mm3, %%mm2\n\t"
"movq %%mm2, %%mm3\n\t"
"psrlq $32, %%mm3\n\t"
"pfadd %%mm3, %%mm2\n"
"\n\t"
"pfmul mm0, [%1 + 48]\n\t"
"pfmul mm1, [%1 + 56]\n\t"
"pfadd mm0, mm1\n\t"
"movq mm1, mm0\n\t"
"psrlq mm1, 32\n\t"
"pfadd mm0, mm1\n"
"pfmul 48(%1), %%mm0\n\t"
"pfmul 56(%1), %%mm1\n\t"
"pfadd %%mm1, %%mm0\n\t"
"movq %%mm0, %%mm1\n\t"
"psrlq $32, %%mm1\n\t"
"pfadd %%mm1, %%mm0\n"
"\n\t"
"punpckldq mm2, mm0\n\t"
"movq [%0 + 8], mm2\n"
"punpckldq %%mm0, %%mm2\n\t"
"movq %%mm2, 8(%0)\n"
"\n\t"
"femms"
:: "r"(&result), "r"(&self->_values), "r"(&vector)
: "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
);
return result;
|