ObjFW  Check-in [3280466d35]

Overview
Comment:OFMatrix4x4: Unroll inner loop in 3DNow! version

This results in a ~ 16% performance improvement on a Duron 750.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 3280466d358455183daebacdccb7be910365812353167bc870bdc49fbdc7acdb
User & Date: js on 2023-11-05 18:18:05
Other Links: manifest | tags
Context
2023-11-06
00:59
OFMatrix4x4: SSE1 for -[transformVectors:count:] check-in: cf955413ab user: js tags: trunk
2023-11-05
18:18
OFMatrix4x4: Unroll inner loop in 3DNow! version check-in: 3280466d35 user: js tags: trunk
13:37
OFMatrix4x4: Improve 3DNow! implementation check-in: 34b48a4208 user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [66a4a58f6e] to [b31d5e9464].

81
82
83
84
85
86
87
88

89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105




























106
107
108


109
110

111
112

113
114
115
116

117
118
119
120
121
122
123
81
82
83
84
85
86
87

88
89
90
91
92
93
94
95
96
97
98







99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


128
129


130


131
132
133
134

135
136
137
138
139
140
141
142







-
+










-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
-
+
+
-
-
+
-
-
+



-
+







/*
 * 3DNow! implementation of 4x4 matrix multiplication.
 *
 * Computes `matrix * self` and stores the product back into `self`, i.e.
 * for every row i and column j:
 *
 *     result[i][j] = sum over k of matrix->_values[i][k] * self->_values[k][j]
 *
 * The product is built in a temporary so that self->_values can be read
 * for all four rows before it is overwritten.
 *
 * self:   Right-hand operand; receives the result.
 * _cmd:   Unused by this function.
 * matrix: Left-hand operand; only read.
 *
 * The assembly loop runs once per row of the left operand, with the four
 * column computations unrolled. Each column computation follows the same
 * pattern:
 *
 *     mm0 = { right[0][j], right[1][j] } * { left[i][0], left[i][1] }
 *     mm1 = { right[2][j], right[3][j] } * { left[i][2], left[i][3] }
 *     mm0 = horizontal sum of all four products
 */
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float (*left)[4] = matrix->_values, (*right)[4] = self->_values;
	float result[4][4], (*resultPtr)[4] = result;

	__asm__ __volatile__ (
	    /* %ecx counts the remaining rows of the left operand. */
	    "movl	$4, %%ecx\n\t"
	    "\n\t"
	    "0:\n\t"
	    /* Column 0: gather right[0..3][0] (row stride is 16 bytes). */
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq	48(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    /* Two accumulates reduce the four products to a single sum. */
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, (%0)\n\t"
	    /* Column 1 */
	    "movd	4(%2), %%mm0\n\t"
	    "punpckldq	20(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	36(%2), %%mm1\n\t"
	    "punpckldq	52(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 4(%0)\n\t"
	    /* Column 2 */
	    "movd	8(%2), %%mm0\n\t"
	    "punpckldq	24(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	40(%2), %%mm1\n\t"
	    "punpckldq	56(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 8(%0)\n\t"
	    /* Column 3 */
	    "movd	12(%2), %%mm0\n\t"
	    "punpckldq	28(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	44(%2), %%mm1\n\t"
	    "punpckldq	60(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 12(%0)\n"
	    "\n\t"
	    /*
	     * Advance to the next row of the result and of the left
	     * operand. The right operand (%2) is addressed with fixed
	     * offsets only and is therefore never advanced.
	     */
	    "add	$16, %0\n\t"
	    "add	$16, %1\n\t"
	    "decl	%%ecx\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    /* Leave MMX state so that x87 code can run afterwards. */
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
	    :: "ecx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, 16 * sizeof(float));
}

static void
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,