ObjFW  Check-in [cf955413ab]

Overview
Comment:OFMatrix4x4: SSE1 for -[transformVectors:count:]

This new SSE1 implementation is better than the SSE4.1 implementation,
hence this also deletes the SSE4.1 implementation.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: cf955413ab50878499914c5bdda3bd1d2a2cb22085165a8747a4e5b015020b05
User & Date: js on 2023-11-06 00:59:37
Other Links: manifest | tags
Context
2023-11-06
20:11
OFMatrix4x4: Fix missing vector reload in SSE check-in: 9ba7594f7b user: js tags: trunk
00:59
OFMatrix4x4: SSE1 for -[transformVectors:count:] check-in: cf955413ab user: js tags: trunk
2023-11-05
18:18
OFMatrix4x4: Unroll inner loop in 3DNow! version check-in: 3280466d35 user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [b31d5e9464] to [ac56c8b767].

28
29
30
31
32
33
34
35
36
37
38
39
40


41
42
43
44
45
46
47
48
49
50

51
52
53





54
55





56
57




58

59
60
61

62



63
64
65
66
67
68
69
70
71
72
73
74
75
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse4.1")
# endif
/*
 * SSE4.1 implementation of -[OFMatrix4x4 transformVectors:count:].
 *
 * Transforms `count` vectors in place: each component of the result is the
 * dot product of one 16-byte row of self->_values with the input vector.
 *
 * All loads and stores use movaps, so both self->_values and vectors have to
 * be 16-byte aligned.
 *
 * self    - The matrix whose _values are used as the four rows.
 * _cmd    - Unused; only present so the function can be installed as an IMP.
 * vectors - The vectors to transform; overwritten with the results.
 * count   - The number of vectors. 0 is allowed and does nothing.
 */
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	__asm__ __volatile__ (
	    /*
	     * Skip everything for an empty array. The exit label has to be
	     * distinct from the loop label: "0f" would resolve to the loop
	     * head, run the body once and wrap the counter around on dec.
	     */
	    "test	%0, %0\n\t"
	    "jz		1f\n"
	    "\n\t"
	    /* Keep all four matrix rows in registers for the whole loop. */
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	48(%2), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"
	    /*
	     * dpps overwrites its destination, so the vector is copied on to
	     * the next register before each dot product.
	     */
	    "movaps	(%1), %%xmm4\n\t"
	    "movaps	%%xmm4, %%xmm5\n\t"
	    "dpps	$0xFF, %%xmm0, %%xmm4\n\t"
	    "movaps	%%xmm5, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm1, %%xmm5\n\t"
	    "movaps	%%xmm6, %%xmm7\n\t"
	    "dpps	$0xFF, %%xmm2, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm3, %%xmm7\n\t"
	    /* Gather the four dot products into the lanes of xmm4. */
	    "insertps	$0x10, %%xmm5, %%xmm4\n\t"
	    "insertps	$0x20, %%xmm6, %%xmm4\n\t"
	    "insertps	$0x30, %%xmm7, %%xmm4\n\t"
	    "movaps	%%xmm4, (%1)\n"
	    "\n\t"
	    /* Advance to the next 16-byte vector. */
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "1:\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif







|


|


>
>







|


>
|
|
|
>
>
>
>
>
|
|
>
>
>
>
>
|
|
>
>
>
>
|
>
|
|
|
>
|
>
>
>





|







28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse")
# endif
/*
 * SSE implementation of -[OFMatrix4x4 transformVectors:count:].
 *
 * Transforms `count` vectors in place: each component of the result is the
 * dot product of one 16-byte row of self->_values with the input vector.
 * The horizontal sum is done by spilling the element-wise product to a
 * scratch buffer and adding the upper three floats to the lowest lane.
 *
 * All packed loads and stores use movaps, so self->_values, vectors and the
 * scratch buffer have to be 16-byte aligned.
 *
 * self    - The matrix whose _values are used as the four rows.
 * _cmd    - Unused; only present so the function can be installed as an IMP.
 * vectors - The vectors to transform; overwritten with the results.
 * count   - The number of vectors. 0 is allowed and does nothing.
 */
static void
transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	/* Scratch space for the element-wise products (written by the asm). */
	OF_ALIGN(16) float tmp[4];

	__asm__ __volatile__ (
	    /*
	     * Skip everything for an empty array. The exit label has to be
	     * distinct from the loop label: "0f" would resolve to the loop
	     * head, run the body once and wrap the counter around on dec.
	     */
	    "test	%0, %0\n\t"
	    "jz		1f\n"
	    "\n\t"
	    /*
	     * Only three rows are kept in registers; xmm3 holds the vector
	     * and xmm4-xmm7 hold the results, so the fourth row is read from
	     * memory in every iteration.
	     */
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n"
	    "\n\t"
	    "0:\n\t"
	    /*
	     * The vector has to be loaded inside the loop: loading it once
	     * before the loop would transform the first vector over and over.
	     */
	    "movaps	(%1), %%xmm3\n"
	    "\n\t"
	    /* Row 0 * vector, summed into the low lane of xmm4. */
	    "movaps	%%xmm0, %%xmm4\n\t"
	    "mulps	%%xmm3, %%xmm4\n\t"
	    "movaps	%%xmm4, (%3)\n\t"
	    "addss	4(%3), %%xmm4\n\t"
	    "addss	8(%3), %%xmm4\n\t"
	    "addss	12(%3), %%xmm4\n"
	    "\n\t"
	    /* Row 1 * vector, summed into the low lane of xmm5. */
	    "movaps	%%xmm1, %%xmm5\n\t"
	    "mulps	%%xmm3, %%xmm5\n\t"
	    "movaps	%%xmm5, (%3)\n\t"
	    "addss	4(%3), %%xmm5\n\t"
	    "addss	8(%3), %%xmm5\n\t"
	    "addss	12(%3), %%xmm5\n"
	    "\n\t"
	    /* Row 2 * vector, summed into the low lane of xmm6. */
	    "movaps	%%xmm2, %%xmm6\n\t"
	    "mulps	%%xmm3, %%xmm6\n\t"
	    "movaps	%%xmm6, (%3)\n\t"
	    "addss	4(%3), %%xmm6\n\t"
	    "addss	8(%3), %%xmm6\n\t"
	    "addss	12(%3), %%xmm6\n"
	    "\n\t"
	    /* Row 3 * vector, summed into the low lane of xmm7. */
	    "movaps	48(%2), %%xmm7\n\t"
	    "mulps	%%xmm3, %%xmm7\n\t"
	    "movaps	%%xmm7, (%3)\n\t"
	    "addss	4(%3), %%xmm7\n\t"
	    "addss	8(%3), %%xmm7\n\t"
	    "addss	12(%3), %%xmm7\n"
	    "\n\t"
	    /* Store the four sums as the components of the result. */
	    "movss	%%xmm4, (%1)\n\t"
	    "movss	%%xmm5, 4(%1)\n\t"
	    "movss	%%xmm6, 8(%1)\n\t"
	    "movss	%%xmm7, 12(%1)\n"
	    "\n\t"
	    /* Advance to the next 16-byte vector. */
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "1:\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values), "r"(&tmp)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE41]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE41)
	} else if ([OFSystemInfo supports3DNow]) {
		REPLACE(@selector(multiplyWithMatrix:),
		    multiplyWithMatrix_3DNow)
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_3DNow)
	}








|

|







227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE)
	} else if ([OFSystemInfo supports3DNow]) {
		REPLACE(@selector(multiplyWithMatrix:),
		    multiplyWithMatrix_3DNow)
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_3DNow)
	}