ObjFW  Diff

Differences From Artifact [b31d5e9464]:

To Artifact [ac56c8b767]:


28
29
30
31
32
33
34
35
36
37
38
39
40


41
42
43
44
45
46
47
48
49
50

51
52
53





54
55





56
57




58

59
60
61

62



63
64
65
66
67
68
69
70
71
72
73
74
75
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse4.1")
# endif
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{


	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	48(%2), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"

	    "movaps	(%1), %%xmm4\n\t"
	    "movaps	%%xmm4, %%xmm5\n\t"
	    "dpps	$0xFF, %%xmm0, %%xmm4\n\t"





	    "movaps	%%xmm5, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm1, %%xmm5\n\t"





	    "movaps	%%xmm6, %%xmm7\n\t"
	    "dpps	$0xFF, %%xmm2, %%xmm6\n\t"




	    "dpps	$0xFF, %%xmm3, %%xmm7\n\t"

	    "insertps	$0x10, %%xmm5, %%xmm4\n\t"
	    "insertps	$0x20, %%xmm6, %%xmm4\n\t"
	    "insertps	$0x30, %%xmm7, %%xmm4\n\t"

	    "movaps	%%xmm4, (%1)\n"



	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif







|


|


>
>







|


>
|
|
|
>
>
>
>
>
|
|
>
>
>
>
>
|
|
>
>
>
>
|
>
|
|
|
>
|
>
>
>





|







28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse")
# endif
static void
transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	OF_ALIGN(16) float tmp[4];

	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	(%1), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"
	    "movaps	%%xmm0, %%xmm4\n\t"
	    "mulps	%%xmm3, %%xmm4\n\t"
	    "movaps	%%xmm4, (%3)\n\t"
	    "addss	4(%3), %%xmm4\n\t"
	    "addss	8(%3), %%xmm4\n\t"
	    "addss	12(%3), %%xmm4\n"
	    "\n\t"
	    "movaps	%%xmm1, %%xmm5\n\t"
	    "mulps	%%xmm3, %%xmm5\n\t"
	    "movaps	%%xmm5, (%3)\n\t"
	    "addss	4(%3), %%xmm5\n\t"
	    "addss	8(%3), %%xmm5\n\t"
	    "addss	12(%3), %%xmm5\n"
	    "\n\t"
	    "movaps	%%xmm2, %%xmm6\n\t"
	    "mulps	%%xmm3, %%xmm6\n\t"
	    "movaps	%%xmm6, (%3)\n\t"
	    "addss	4(%3), %%xmm6\n\t"
	    "addss	8(%3), %%xmm6\n\t"
	    "addss	12(%3), %%xmm6\n"
	    "\n\t"
	    "movaps	48(%2), %%xmm7\n\t"
	    "mulps	%%xmm3, %%xmm7\n\t"
	    "movaps	%%xmm7, (%3)\n\t"
	    "addss	4(%3), %%xmm7\n\t"
	    "addss	8(%3), %%xmm7\n\t"
	    "addss	12(%3), %%xmm7\n"
	    "\n\t"
	    "movss	%%xmm4, (%1)\n\t"
	    "movss	%%xmm5, 4(%1)\n\t"
	    "movss	%%xmm6, 8(%1)\n\t"
	    "movss	%%xmm7, 12(%1)\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values), "r"(&tmp)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE41]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE41)
	} else if ([OFSystemInfo supports3DNow]) {
		REPLACE(@selector(multiplyWithMatrix:),
		    multiplyWithMatrix_3DNow)
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_3DNow)
	}








|

|







227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE)
	} else if ([OFSystemInfo supports3DNow]) {
		REPLACE(@selector(multiplyWithMatrix:),
		    multiplyWithMatrix_3DNow)
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_3DNow)
	}