ObjFW  Check-in [cf955413ab]

Overview
Comment: OFMatrix4x4: SSE1 for -[transformVectors:count:]

This new SSE1 implementation is better than the SSE4.1 implementation,
so this check-in also removes the SSE4.1 implementation.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: cf955413ab50878499914c5bdda3bd1d2a2cb22085165a8747a4e5b015020b05
User & Date: js on 2023-11-06 00:59:37
Other Links: manifest | tags
Context
2023-11-06
20:11
OFMatrix4x4: Fix missing vector reload in SSE check-in: 9ba7594f7b user: js tags: trunk
00:59
OFMatrix4x4: SSE1 for -[transformVectors:count:] check-in: cf955413ab user: js tags: trunk
2023-11-05
18:18
OFMatrix4x4: Unroll inner loop in 3DNow! version check-in: 3280466d35 user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [b31d5e9464] to [ac56c8b767].

28
29
30
31
32
33
34
35

36
37
38

39
40


41
42
43
44
45
46
47
48

49
50

51
52
53
54
55
56
57
58
59
60
61
62































63
64
65
66
67
68

69
70
71
72
73
74
75
28
29
30
31
32
33
34

35
36
37

38
39
40
41
42
43
44
45
46
47
48
49

50
51
52
53












54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

90
91
92
93
94
95
96
97







-
+


-
+


+
+







-
+


+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+





-
+







	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse4.1")
#  pragma GCC target("sse")
# endif
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	OF_ALIGN(16) float tmp[4];

	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	48(%2), %%xmm3\n"
	    "movaps	(%1), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"
	    "movaps	%%xmm0, %%xmm4\n\t"
	    "movaps	(%1), %%xmm4\n\t"
	    "movaps	%%xmm4, %%xmm5\n\t"
	    "dpps	$0xFF, %%xmm0, %%xmm4\n\t"
	    "movaps	%%xmm5, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm1, %%xmm5\n\t"
	    "movaps	%%xmm6, %%xmm7\n\t"
	    "dpps	$0xFF, %%xmm2, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm3, %%xmm7\n\t"
	    "insertps	$0x10, %%xmm5, %%xmm4\n\t"
	    "insertps	$0x20, %%xmm6, %%xmm4\n\t"
	    "insertps	$0x30, %%xmm7, %%xmm4\n\t"
	    "movaps	%%xmm4, (%1)\n"
	    "mulps	%%xmm3, %%xmm4\n\t"
	    "movaps	%%xmm4, (%3)\n\t"
	    "addss	4(%3), %%xmm4\n\t"
	    "addss	8(%3), %%xmm4\n\t"
	    "addss	12(%3), %%xmm4\n"
	    "\n\t"
	    "movaps	%%xmm1, %%xmm5\n\t"
	    "mulps	%%xmm3, %%xmm5\n\t"
	    "movaps	%%xmm5, (%3)\n\t"
	    "addss	4(%3), %%xmm5\n\t"
	    "addss	8(%3), %%xmm5\n\t"
	    "addss	12(%3), %%xmm5\n"
	    "\n\t"
	    "movaps	%%xmm2, %%xmm6\n\t"
	    "mulps	%%xmm3, %%xmm6\n\t"
	    "movaps	%%xmm6, (%3)\n\t"
	    "addss	4(%3), %%xmm6\n\t"
	    "addss	8(%3), %%xmm6\n\t"
	    "addss	12(%3), %%xmm6\n"
	    "\n\t"
	    "movaps	48(%2), %%xmm7\n\t"
	    "mulps	%%xmm3, %%xmm7\n\t"
	    "movaps	%%xmm7, (%3)\n\t"
	    "addss	4(%3), %%xmm7\n\t"
	    "addss	8(%3), %%xmm7\n\t"
	    "addss	12(%3), %%xmm7\n"
	    "\n\t"
	    "movss	%%xmm4, (%1)\n\t"
	    "movss	%%xmm5, 4(%1)\n\t"
	    "movss	%%xmm6, 8(%1)\n\t"
	    "movss	%%xmm7, 12(%1)\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values)
	    : "r"(self->_values), "r"(&tmp)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif
205
206
207
208
209
210
211
212

213
214

215
216
217
218
219
220
221
227
228
229
230
231
232
233

234
235

236
237
238
239
240
241
242
243







-
+

-
+







		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE41]) {
	if ([OFSystemInfo supportsSSE]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE41)
		    transformVectors_SSE)
	} else if ([OFSystemInfo supports3DNow]) {
		REPLACE(@selector(multiplyWithMatrix:),
		    multiplyWithMatrix_3DNow)
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_3DNow)
	}