34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-
|
# pragma GCC push_options
# pragma GCC target("3dnow")
# endif
/*
 * Multiplies two 4x4 float matrices using enhanced 3DNow! instructions and
 * stores the product in the receiver: self = matrix * self.
 *
 * self   - Matrix that supplies the right-hand operand and receives the
 *	    result.
 * _cmd   - Unused; present because the signature follows the Objective-C
 *	    method-implementation convention.
 * matrix - Matrix that supplies the left-hand operand; it is only read.
 *
 * The product is built in a local buffer and copied into self afterwards, so
 * self can safely be read as an operand during the whole computation.
 *
 * The byte offsets used in the assembly (16 bytes per row, 4 bytes per
 * element) assume that _values is a contiguous float[4][4] -- confirm against
 * the class declaration.
 */
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
{
	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
	float result[4][4], *resultPtr = &result[0][0];

	/*
	 * %0 = resultPtr (next element to write)
	 * %1 = left      (start of the current row of matrix)
	 * %2 = right     (top of the current column of self)
	 * %cl counts columns, %ch counts rows.
	 */
	__asm__ __volatile__ (
	    "xorw %%cx, %%cx\n"
	    "\n\t"
	    "0:\n\t"
	    /* mm0 = { right[0][j], right[1][j] } * { left[i][0], left[i][1] } */
	    "movd (%2), %%mm0\n\t"
	    "punpckldq 16(%2), %%mm0\n\t"
	    "pfmul (%1), %%mm0\n\t"
	    /* mm1 = { right[2][j], right[3][j] } * { left[i][2], left[i][3] } */
	    "movd 32(%2), %%mm1\n\t"
	    "punpckldq 48(%2), %%mm1\n\t"
	    "pfmul 8(%1), %%mm1\n\t"
	    "pfadd %%mm1, %%mm0\n\t"
	    /* Horizontal add: swap the halves of mm0 and add them. */
	    "pswapd %%mm0, %%mm1\n\t"
	    "pfadd %%mm1, %%mm0\n\t"
	    "movd %%mm0, (%0)\n"
	    "\n\t"
	    /* Next result element and next column of the right operand. */
	    "add $4, %0\n\t"
	    "add $4, %2\n\t"
	    "incb %%cl\n\t"
	    "cmpb $4, %%cl\n\t"
	    "jb 0b\n"
	    "\n\t"
	    /* Next row of the left operand; rewind right to column 0. */
	    "add $16, %1\n\t"
	    "sub $16, %2\n\t"
	    "xorb %%cl, %%cl\n\t"
	    "incb %%ch\n\t"
	    "cmpb $4, %%ch\n\t"
	    "jb 0b\n"
	    "\n\t"
	    /* Leave MMX state so that x87 code can run again afterwards. */
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
	    :: "cx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, sizeof(result));
}
/*
 * Multiplies two 4x4 float matrices using baseline 3DNow! instructions and
 * stores the product in the receiver: self = matrix * self.
 *
 * This is the same computation as the enhanced 3DNow! variant, except that
 * the horizontal add uses movq + psrlq instead of pswapd.
 *
 * self   - Matrix that supplies the right-hand operand and receives the
 *	    result.
 * _cmd   - Unused; present because the signature follows the Objective-C
 *	    method-implementation convention.
 * matrix - Matrix that supplies the left-hand operand; it is only read.
 *
 * The product is built in a local buffer and copied into self afterwards, so
 * self can safely be read as an operand during the whole computation.
 *
 * The byte offsets used in the assembly (16 bytes per row, 4 bytes per
 * element) assume that _values is a contiguous float[4][4] -- confirm against
 * the class declaration.
 */
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
	float result[4][4], *resultPtr = &result[0][0];

	/*
	 * %0 = resultPtr (next element to write)
	 * %1 = left      (start of the current row of matrix)
	 * %2 = right     (top of the current column of self)
	 * %cl counts columns, %ch counts rows.
	 */
	__asm__ __volatile__ (
	    "xorw %%cx, %%cx\n"
	    "\n\t"
	    "0:\n\t"
	    /* mm0 = { right[0][j], right[1][j] } * { left[i][0], left[i][1] } */
	    "movd (%2), %%mm0\n\t"
	    "punpckldq 16(%2), %%mm0\n\t"
	    "pfmul (%1), %%mm0\n\t"
	    /* mm1 = { right[2][j], right[3][j] } * { left[i][2], left[i][3] } */
	    "movd 32(%2), %%mm1\n\t"
	    "punpckldq 48(%2), %%mm1\n\t"
	    "pfmul 8(%1), %%mm1\n\t"
	    "pfadd %%mm1, %%mm0\n\t"
	    /* Horizontal add: move the high half of mm0 down and add it. */
	    "movq %%mm0, %%mm1\n\t"
	    "psrlq $32, %%mm1\n\t"
	    "pfadd %%mm1, %%mm0\n\t"
	    "movd %%mm0, (%0)\n"
	    "\n\t"
	    /* Next result element and next column of the right operand. */
	    "add $4, %0\n\t"
	    "add $4, %2\n\t"
	    "incb %%cl\n\t"
	    "cmpb $4, %%cl\n\t"
	    "jb 0b\n"
	    "\n\t"
	    /* Next row of the left operand; rewind right to column 0. */
	    "add $16, %1\n\t"
	    "sub $16, %2\n\t"
	    "xorb %%cl, %%cl\n\t"
	    "incb %%ch\n\t"
	    "cmpb $4, %%ch\n\t"
	    "jb 0b\n"
	    "\n\t"
	    /* Leave MMX state so that x87 code can run again afterwards. */
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
	    :: "cx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, sizeof(result));
}
static void
transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
size_t count)
|