ObjFW  Check-in [0d671245d4]

Overview
Comment:OFMatrix4x4: Move __asm__ out of loop
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 0d671245d4bb5953939c96cc4ffe77d989bc2dcb8111d6aceade740e3d90bde5
User & Date: js on 2023-11-02 22:48:20
Other Links: manifest | tags
Context
2023-11-02
23:00
OFMatrix4x4: Minor cleanups check-in: b5c3a36731 user: js tags: trunk
22:48
OFMatrix4x4: Move __asm__ out of loop check-in: 0d671245d4 user: js tags: trunk
2023-11-01
21:22
OFMatrix4x4: Use __asm__ __volatile__ check-in: f949f7775b user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [b42a0641ea] to [a12ee064b0].

34
35
36
37
38
39
40

41
42
43
44
45



46
47
48
49
50
51
52
53
54
55















56
57
58
59
60
61
62
63
64
65
66
67
68
69

70
71
72
73
74



75
76
77
78
79
80
81
82
83
84
85















86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#  pragma GCC push_options
#  pragma GCC target("3dnow")
# endif
/*
 * Multiplies `matrix` (left operand) with `self` (right operand) using
 * 3DNow! instructions plus `pswapd`, and stores the product back into
 * `self->_values`.
 *
 * For every row i and column j, one asm statement computes
 *
 *   result[i][j] = sum over k of matrix->_values[i][k] * self->_values[k][j]
 *
 * The product is accumulated in a temporary so that `self->_values` can be
 * read unmodified until all 16 elements are done.
 *
 * `_cmd` is not used by this function.
 *
 * NOTE(review): the 16/32/48 byte displacements assume `_values` is a
 * contiguous float[4][4] (16 bytes per row) - confirm against the class
 * declaration.
 */
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
{
	float result[4][4];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			/*
			 * %0 is written by the final movd, so it has to be
			 * declared as an output operand ("=m"). Input-only
			 * operands must not be modified by the asm.
			 *
			 * The "memory" clobber is kept because the matrix
			 * elements are read through the pointers in %1 and %2
			 * rather than through "m" operands.
			 */
			__asm__ __volatile__ (
			    /* mm0 = { self[0][j], self[1][j] } */
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    /* mm0 *= { matrix[i][0], matrix[i][1] } */
			    "pfmul	(%1), %%mm0\n\t"
			    /* mm1 = { self[2][j], self[3][j] } */
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    /* mm1 *= { matrix[i][2], matrix[i][3] } */
			    "pfmul	8(%1), %%mm1\n\t"
			    /* Add the two partial product pairs. */
			    "pfadd	%%mm1, %%mm0\n\t"
			    /* Horizontal add: swap halves, then add. */
			    "pswapd	%%mm0, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    /* Store the low dword, which holds the full sum. */
			    "movd	%%mm0, %0"
			    : "=m"(result[i][j])
			    : "r"(&matrix->_values[i][0]),
			      "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
		}
	}

	/* Leave MMX state once, after all elements have been computed. */
	__asm__ __volatile__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}
/*
 * Multiplies `matrix` (left operand) with `self` (right operand) using only
 * base 3DNow! / MMX instructions, and stores the product back into
 * `self->_values`.
 *
 * For every row i and column j, one asm statement computes
 *
 *   result[i][j] = sum over k of matrix->_values[i][k] * self->_values[k][j]
 *
 * This differs from the enhanced variant only in the horizontal add, which
 * uses movq + psrlq instead of pswapd.
 *
 * `_cmd` is not used by this function.
 *
 * NOTE(review): the 16/32/48 byte displacements assume `_values` is a
 * contiguous float[4][4] (16 bytes per row) - confirm against the class
 * declaration.
 */
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float result[4][4];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			/*
			 * %0 is written by the final movd, so it has to be
			 * declared as an output operand ("=m"). Input-only
			 * operands must not be modified by the asm.
			 *
			 * The "memory" clobber is kept because the matrix
			 * elements are read through the pointers in %1 and %2
			 * rather than through "m" operands.
			 */
			__asm__ __volatile__ (
			    /* mm0 = { self[0][j], self[1][j] } */
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    /* mm0 *= { matrix[i][0], matrix[i][1] } */
			    "pfmul	(%1), %%mm0\n\t"
			    /* mm1 = { self[2][j], self[3][j] } */
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    /* mm1 *= { matrix[i][2], matrix[i][3] } */
			    "pfmul	8(%1), %%mm1\n\t"
			    /* Add the two partial product pairs. */
			    "pfadd	%%mm1, %%mm0\n\t"
			    /*
			     * Horizontal add: copy mm0, shift the high dword
			     * down into the low dword, then add.
			     */
			    "movq	%%mm0, %%mm1\n\t"
			    "psrlq	$32, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    /* Store the low dword, which holds the full sum. */
			    "movd	%%mm0, %0"
			    : "=m"(result[i][j])
			    : "r"(&matrix->_values[i][0]),
			      "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
		}
	}

	/* Leave MMX state once, after all elements have been computed. */
	__asm__ __volatile__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}

static void
transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)







>
|

<
<
|
>
>
>
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
|
|
<
<
<
<






>
|

<
<
|
>
>
>
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
|
|
<
<
<
<







34
35
36
37
38
39
40
41
42
43


44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

74
75




76
77
78
79
80
81
82
83
84


85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

116
117




118
119
120
121
122
123
124
#  pragma GCC push_options
#  pragma GCC target("3dnow")
# endif
/*
 * Multiplies `matrix` (left operand) with `self` (right operand) in a single
 * asm statement and copies the product into `self->_values`.
 *
 * The asm contains both loops: %cl counts the 4 columns (inner loop) and %ch
 * counts the 4 rows (outer loop). Each inner iteration computes one element
 *
 *   result[row][col] = sum over k of left[row * 4 + k] * right[k * 4 + col]
 *
 * and stores it through `resultPtr`, which advances by one float each time.
 * This variant uses `pswapd` for the horizontal add.
 *
 * `_cmd` is not used by this function.
 *
 * NOTE(review): the 16/32/48 byte displacements and the 16 byte row step
 * assume `_values` is a contiguous float[4][4] - confirm against the class
 * declaration.
 */
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
{
	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
	float result[4][4], *resultPtr = &result[0][0];

	/*
	 * Operands: %0 = resultPtr, %1 = left, %2 = right. All three are
	 * modified inside the asm, hence the "+r" constraints.
	 */

	__asm__ __volatile__ (
	    /* Zero both loop counters (%cl = column, %ch = row). */
	    "xorw	%%cx, %%cx\n"
	    "\n\t"
	    /* Shared branch target for the inner and the outer loop. */
	    "0:\n\t"
	    /* mm0 = { right[0], right[4] }: two elements of one column */
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    /* mm0 *= { left[0], left[1] } */
	    "pfmul	(%1), %%mm0\n\t"
	    /* mm1 = { right[8], right[12] }: the other two of that column */
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    /* mm1 *= { left[2], left[3] } */
	    "pfmul	8(%1), %%mm1\n\t"
	    /* Add the two partial product pairs. */
	    "pfadd	%%mm1, %%mm0\n\t"
	    /* Horizontal add: swap halves, then add. */
	    "pswapd	%%mm0, %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    /* Store the low dword, which holds the full sum. */
	    "movd	%%mm0, (%0)\n"
	    "\n\t"
	    /* Inner loop step: next result element, next column of right. */
	    "add	$4, %0\n\t"
	    "add	$4, %2\n\t"
	    "incb	%%cl\n\t"
	    "cmpb	$4, %%cl\n\t"
	    "jb		0b\n"
	    "\n\t"
	    /*
	     * Outer loop step: advance left by 16 bytes (next row), move
	     * right back by the 16 bytes the inner loop added, reset the
	     * column counter.
	     */
	    "add	$16, %1\n\t"
	    "sub	$16, %2\n\t"
	    "xorb	%%cl, %%cl\n\t"
	    "incb	%%ch\n\t"
	    "cmpb	$4, %%ch\n\t"
	    "jb		0b\n"
	    "\n\t"
	    /* Leave MMX state once all 16 elements are stored. */
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)

	    :: "cx", "mm0", "mm1", "memory"
	);

	/*
	 * The product was built in a temporary because the asm reads
	 * self->_values (via right) during the whole computation.
	 */

	memcpy(self->_values, result, sizeof(result));
}
/*
 * Multiplies `matrix` (left operand) with `self` (right operand) in a single
 * asm statement and copies the product into `self->_values`.
 *
 * The asm contains both loops: %cl counts the 4 columns (inner loop) and %ch
 * counts the 4 rows (outer loop). Each inner iteration computes one element
 *
 *   result[row][col] = sum over k of left[row * 4 + k] * right[k * 4 + col]
 *
 * and stores it through `resultPtr`, which advances by one float each time.
 * This variant does the horizontal add with movq + psrlq instead of pswapd.
 *
 * `_cmd` is not used by this function.
 *
 * NOTE(review): the 16/32/48 byte displacements and the 16 byte row step
 * assume `_values` is a contiguous float[4][4] - confirm against the class
 * declaration.
 */
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
	float result[4][4], *resultPtr = &result[0][0];

	/*
	 * Operands: %0 = resultPtr, %1 = left, %2 = right. All three are
	 * modified inside the asm, hence the "+r" constraints.
	 */

	__asm__ __volatile__ (
	    /* Zero both loop counters (%cl = column, %ch = row). */
	    "xorw	%%cx, %%cx\n"
	    "\n\t"
	    /* Shared branch target for the inner and the outer loop. */
	    "0:\n\t"
	    /* mm0 = { right[0], right[4] }: two elements of one column */
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    /* mm0 *= { left[0], left[1] } */
	    "pfmul	(%1), %%mm0\n\t"
	    /* mm1 = { right[8], right[12] }: the other two of that column */
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    /* mm1 *= { left[2], left[3] } */
	    "pfmul	8(%1), %%mm1\n\t"
	    /* Add the two partial product pairs. */
	    "pfadd	%%mm1, %%mm0\n\t"
	    /*
	     * Horizontal add: copy mm0, shift the high dword down into the
	     * low dword, then add.
	     */
	    "movq	%%mm0, %%mm1\n\t"
	    "psrlq	$32, %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    /* Store the low dword, which holds the full sum. */
	    "movd	%%mm0, (%0)\n"
	    "\n\t"
	    /* Inner loop step: next result element, next column of right. */
	    "add	$4, %0\n\t"
	    "add	$4, %2\n\t"
	    "incb	%%cl\n\t"
	    "cmpb	$4, %%cl\n\t"
	    "jb		0b\n"
	    "\n\t"
	    /*
	     * Outer loop step: advance left by 16 bytes (next row), move
	     * right back by the 16 bytes the inner loop added, reset the
	     * column counter.
	     */
	    "add	$16, %1\n\t"
	    "sub	$16, %2\n\t"
	    "xorb	%%cl, %%cl\n\t"
	    "incb	%%ch\n\t"
	    "cmpb	$4, %%ch\n\t"
	    "jb		0b\n"
	    "\n\t"
	    /* Leave MMX state once all 16 elements are stored. */
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)

	    :: "cx", "mm0", "mm1", "memory"
	);

	/*
	 * The product was built in a temporary because the asm reads
	 * self->_values (via right) during the whole computation.
	 */

	memcpy(self->_values, result, sizeof(result));
}

static void
transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
	    "0:\n\t"
	    "femms"
	    : "+r"(count), "+r"(vectors)
	    : "r"(&self->_values)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);
}

# ifndef __clang__
#  pragma GCC pop_options
# endif

+ (void)initialize
{
	const char *typeEncoding;







<







237
238
239
240
241
242
243

244
245
246
247
248
249
250
	    "0:\n\t"
	    "femms"
	    : "+r"(count), "+r"(vectors)
	    : "r"(&self->_values)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);
}

# ifndef __clang__
#  pragma GCC pop_options
# endif

+ (void)initialize
{
	const char *typeEncoding;