ObjFW  Check-in [3280466d35]

Overview
Comment:OFMatrix4x4: Unroll inner loop in 3DNow! version

This results in a ~16% performance improvement on a Duron 750.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 3280466d358455183daebacdccb7be910365812353167bc870bdc49fbdc7acdb
User & Date: js on 2023-11-05 18:18:05
Other Links: manifest | tags
Context
2023-11-06
00:59
OFMatrix4x4: SSE1 for -[transformVectors:count:] check-in: cf955413ab user: js tags: trunk
2023-11-05
18:18
OFMatrix4x4: Unroll inner loop in 3DNow! version check-in: 3280466d35 user: js tags: trunk
13:37
OFMatrix4x4: Improve 3DNow! implementation check-in: 34b48a4208 user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [66a4a58f6e] to [b31d5e9464].

81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

101





102
103







104








105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float (*left)[4] = matrix->_values, (*right)[4] = self->_values;
	float result[4][4], (*resultPtr)[4] = result;

	__asm__ __volatile__ (
	    "xorw	%%cx, %%cx\n"
	    "\n\t"
	    "0:\n\t"
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, (%0)\n"
	    "\n\t"

	    "add	$4, %0\n\t"





	    "add	$4, %2\n\t"
	    "incb	%%cl\n\t"







	    "cmpb	$4, %%cl\n\t"








	    "jb		0b\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "sub	$16, %2\n\t"
	    "xorb	%%cl, %%cl\n\t"
	    "incb	%%ch\n\t"
	    "cmpb	$4, %%ch\n\t"
	    "jb		0b\n"
	    "\n\t"
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
	    :: "cx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, 16 * sizeof(float));
}

static void
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,







|










|
|
>
|
>
>
>
>
>
|
|
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
|

|
|
<
|
<
|



|







81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

130

131
132
133
134
135
136
137
138
139
140
141
142
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float (*left)[4] = matrix->_values, (*right)[4] = self->_values;
	float result[4][4], (*resultPtr)[4] = result;

	__asm__ __volatile__ (
	    "movl	$4, %%ecx\n\t"
	    "\n\t"
	    "0:\n\t"
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, (%0)\n\t"
	    "movd	4(%2), %%mm0\n\t"
	    "punpckldq	20(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	36(%2), %%mm1\n\t"
	    "punpckldq  52(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 4(%0)\n\t"
	    "movd	8(%2), %%mm0\n\t"
	    "punpckldq	24(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	40(%2), %%mm1\n\t"
	    "punpckldq  56(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 8(%0)\n\t"
	    "movd	12(%2), %%mm0\n\t"
	    "punpckldq	28(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	44(%2), %%mm1\n\t"
	    "punpckldq  60(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 12(%0)\n"
	    "\n\t"
	    "add	$16, %0\n\t"
	    "add	$16, %1\n\t"

	    "decl	%%ecx\n\t"

	    "jnz	0b\n"
	    "\n\t"
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
	    :: "ecx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, 16 * sizeof(float));
}

static void
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,