ObjFW  Check-in [0d671245d4]

Overview
Comment: OFMatrix4x4: Move __asm__ out of loop
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 0d671245d4bb5953939c96cc4ffe77d989bc2dcb8111d6aceade740e3d90bde5
User & Date: js on 2023-11-02 22:48:20
Other Links: manifest | tags
Context
2023-11-02
23:00
OFMatrix4x4: Minor cleanups check-in: b5c3a36731 user: js tags: trunk
22:48
OFMatrix4x4: Move __asm__ out of loop check-in: 0d671245d4 user: js tags: trunk
2023-11-01
21:22
OFMatrix4x4: Use __asm__ __volatile__ check-in: f949f7775b user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [b42a0641ea] to [a12ee064b0].

34
35
36
37
38
39
40

41

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56






























57
58
59


60
61
62
63
64
65
66
67
68
69

70

71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86































87
88
89


90
91
92
93
94
95
96
97
98
99
100
34
35
36
37
38
39
40
41

42
43














44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73



74
75




76
77
78
79
80
81
82

83
84















85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115



116
117




118
119
120
121
122
123
124







+
-
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-






+
-
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-







#  pragma GCC push_options
#  pragma GCC target("3dnow")
# endif
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
{
	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
	float result[4][4];
	float result[4][4], *resultPtr = &result[0][0];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			__asm__ __volatile__ (
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    "pfmul	(%1), %%mm0\n\t"
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    "pfmul	8(%1), %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "pswapd	%%mm0, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movd	%%mm0, %0"
			    :: "m"(result[i][j]), "r"(&matrix->_values[i][0]),
	__asm__ __volatile__ (
	    "xorw	%%cx, %%cx\n"
	    "\n\t"
	    "0:\n\t"
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    "pswapd	%%mm0, %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    "movd	%%mm0, (%0)\n"
	    "\n\t"
	    "add	$4, %0\n\t"
	    "add	$4, %2\n\t"
	    "incb	%%cl\n\t"
	    "cmpb	$4, %%cl\n\t"
	    "jb		0b\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "sub	$16, %2\n\t"
	    "xorb	%%cl, %%cl\n\t"
	    "incb	%%ch\n\t"
	    "cmpb	$4, %%ch\n\t"
	    "jb		0b\n"
	    "\n\t"
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
			       "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
	    :: "cx", "mm0", "mm1", "memory"
	);
		}
	}

	__asm__ __volatile__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float *left = &matrix->_values[0][0], *right = &self->_values[0][0];
	float result[4][4];
	float result[4][4], *resultPtr = &result[0][0];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			__asm__ __volatile__ (
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    "pfmul	(%1), %%mm0\n\t"
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    "pfmul	8(%1), %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movq	%%mm0, %%mm1\n\t"
			    "psrlq	$32, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movd	%%mm0, %0"
			    :: "m"(result[i][j]), "r"(&matrix->_values[i][0]),
	__asm__ __volatile__ (
	    "xorw	%%cx, %%cx\n"
	    "\n\t"
	    "0:\n\t"
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    "movq	%%mm0, %%mm1\n\t"
	    "psrlq	$32, %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    "movd	%%mm0, (%0)\n"
	    "\n\t"
	    "add	$4, %0\n\t"
	    "add	$4, %2\n\t"
	    "incb	%%cl\n\t"
	    "cmpb	$4, %%cl\n\t"
	    "jb		0b\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "sub	$16, %2\n\t"
	    "xorb	%%cl, %%cl\n\t"
	    "incb	%%ch\n\t"
	    "cmpb	$4, %%ch\n\t"
	    "jb		0b\n"
	    "\n\t"
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
			       "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
	    :: "cx", "mm0", "mm1", "memory"
	);
		}
	}

	__asm__ __volatile__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}

static void
transformVectors_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
237
238
239
240
241
242
243

244
245
246
247
248
249
250







-







	    "0:\n\t"
	    "femms"
	    : "+r"(count), "+r"(vectors)
	    : "r"(&self->_values)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);
}

# ifndef __clang__
#  pragma GCC pop_options
# endif

+ (void)initialize
{
	const char *typeEncoding;