ObjFW  Check-in [d9af65de97]

Overview
Comment: Don't use -masm=intel

It's broken in older versions of Clang (e.g. Clang 12).

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: d9af65de971c5ae1e4544916cc11e6045a7fcd2f30d0226bc37ef37fb2b14ac6
User & Date: js on 2023-10-31 20:25:57
Other Links: manifest | tags
Context
2023-10-31
20:27
OFMatrix4x4: Partially unroll multiplication loop check-in: d53c87e7bb user: js tags: trunk
20:25
Don't use -masm=intel check-in: d9af65de97 user: js tags: trunk
2023-10-30
23:58
OFMatrix4x4: Use 3DNow! to transform vectors check-in: 1ac0583aae user: js tags: trunk
Changes

Modified configure.ac from [ac89f801b1] to [3d6a6f352e].

330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
330
331
332
333
334
335
336









337
338
339
340
341
342
343







-
-
-
-
-
-
-
-
-







AX_CHECK_COMPILER_FLAGS(-fno-common, [OBJCFLAGS="$OBJCFLAGS -fno-common"])
AX_CHECK_COMPILER_FLAGS(-Xclang -fno-constant-cfstrings, [
	flag="-Xclang -fno-constant-cfstrings"
	OBJCFLAGS="$OBJCFLAGS $flag"
	OBJFW_OBJCFLAGS="$OBJFW_OBJCFLAGS $flag"
])

case "$host_cpu" in
i*86|x86_64)
	AX_CHECK_COMPILER_FLAGS([-masm=intel], [
		OBJCFLAGS="$OBJCFLAGS -masm=intel"
		AC_DEFINE(HAVE_INTEL_SYNTAX, 1, [Whether asm syntax is Intel])
	])
	;;
esac

AX_CHECK_COMPILER_FLAGS([-Wsign-compare -Werror],
	[OBJCFLAGS="$OBJCFLAGS -Wsign-compare"])
AS_IF([test x"$with_nds" != x"yes"], [
	AX_CHECK_COMPILER_FLAGS([-Wshadow -Werror],
		[OBJCFLAGS="$OBJCFLAGS -Wshadow"])
])
AX_CHECK_COMPILER_FLAGS([-Wshorten-64-to-32 -Werror],

Modified src/OFMatrix4x4.m from [9f0079818c] to [c5f86994f5].

25
26
27
28
29
30
31
32

33
34
35
36

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51











52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71


72
73
74
75
76
77
78
79
80








81
82
83
84
85
86
87
88
89








90
91
92


93
94
95
96
97
98
99
100
101








102
103
104
105
106
107
108






109
110
111


112
113
114
115
116
117
118
25
26
27
28
29
30
31

32
33
34
35

36
37
38
39
40











41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69


70
71
72








73
74
75
76
77
78
79
80
81








82
83
84
85
86
87
88
89
90


91
92
93








94
95
96
97
98
99
100
101
102






103
104
105
106
107
108
109


110
111
112
113
114
115
116
117
118







-
+



-
+




-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+


















-
-
+
+

-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+

-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+

-
-
+
+

-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+

-
-
-
-
-
-
+
+
+
+
+
+

-
-
+
+







	{ 1, 0, 0, 0 },
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(HAVE_INTEL_SYNTAX)
#if defined(OF_AMD64) || defined(OF_X86)
/*
 * 3DNow! implementation of 4x4 matrix multiplication: computes
 * result[i][j] = sum over k of matrix->_values[i][k] * self->_values[k][j]
 * and then copies the product into self->_values.
 *
 * self   - left unchanged until the final memcpy, then overwritten with the
 *          product.
 * _cmd   - unused in the body.
 * matrix - supplies row i of the left-hand operand for each output element.
 *
 * The byte offsets used in the asm (0/16/32/48 for the column, 0/8 for the
 * row) assume _values is a float[4][4] - TODO confirm against the class
 * declaration, which is not visible here.
 *
 * NOTE(review): this listing is a diff rendering. It shows both the removed
 * Intel-syntax instruction strings and the added AT&T-syntax ones, as well
 * as both the old and the new declaration of `result`; only one of each
 * belongs to any single revision of the file.
 *
 * NOTE(review): operand %0 (result[i][j]) is declared in the input list
 * (after "::") even though the final movd stores to it; the "memory" clobber
 * is what tells the compiler that memory was written. An "=m" output operand
 * may express this more directly - confirm before changing.
 */
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float result[4][4] = {{ 0 }};
	float result[4][4];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			/*
			 * %1 points at row i of matrix, %2 at element [0][j]
			 * of self. mm0 gathers the column elements at offsets
			 * 0 and 16, mm1 those at offsets 32 and 48; each pair
			 * is multiplied with the matching half of the row,
			 * the two partial products are added, and the high
			 * lane is folded onto the low lane so the low 32 bits
			 * hold the dot product that is stored to %0.
			 */
			__asm__ (
			    "movd	mm0, [%2]\n\t"
			    "punpckldq	mm0, [%2 + 16]\n\t"
			    "pfmul	mm0, [%1]\n\t"
			    "movd	mm1, [%2 + 32]\n\t"
			    "punpckldq	mm1, [%2 + 48]\n\t"
			    "pfmul	mm1, [%1 + 8]\n\t"
			    "pfadd	mm0, mm1\n\t"
			    "movq	mm1, mm0\n\t"
			    "psrlq	mm1, 32\n\t"
			    "pfadd	mm0, mm1\n\t"
			    "movd	%0, mm0"
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    "pfmul	(%1), %%mm0\n\t"
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    "pfmul	8(%1), %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movq	%%mm0, %%mm1\n\t"
			    "psrlq	$32, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movd	%%mm0, %0"
			    :: "m"(result[i][j]), "r"(&matrix->_values[i][0]),
			       "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
		}
	}

	/* Leave MMX/3DNow! state once, after all 16 elements are computed. */
	__asm__ ("femms");

	/*
	 * The product was built in a temporary because self->_values is still
	 * being read as an operand inside the loop.
	 */
	memcpy(self->_values, result, sizeof(result));
}

static OFVector4D
transformedVector_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D vector)
{
	OFVector4D result;

	__asm__ (
	    "movq	mm0, [%2]\n\t"
	    "movq	mm1, [%2 + 8]\n"
	    "movq	(%2), %%mm0\n\t"
	    "movq	8(%2), %%mm1\n"
	    "\n\t"
	    "movq	mm2, mm0\n\t"
	    "movq	mm3, mm1\n\t"
	    "pfmul	mm2, [%1]\n\t"
	    "pfmul	mm3, [%1 + 8]\n\t"
	    "pfadd	mm2, mm3\n\t"
	    "movq	mm3, mm2\n\t"
	    "psrlq	mm3, 32\n\t"
	    "pfadd	mm2, mm3\n"
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	(%1), %%mm2\n\t"
	    "pfmul	8(%1), %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, %%mm3\n\t"
	    "psrlq	$32, %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n"
	    "\n\t"
	    "movq	mm3, mm0\n\t"
	    "movq	mm4, mm1\n\t"
	    "pfmul	mm3, [%1 + 16]\n\t"
	    "pfmul	mm4, [%1 + 24]\n\t"
	    "pfadd	mm3, mm4\n\t"
	    "movq	mm4, mm3\n\t"
	    "psrlq	mm4, 32\n\t"
	    "pfadd	mm3, mm4\n"
	    "movq	%%mm0, %%mm3\n\t"
	    "movq	%%mm1, %%mm4\n\t"
	    "pfmul	16(%1), %%mm3\n\t"
	    "pfmul	24(%1), %%mm4\n\t"
	    "pfadd	%%mm4, %%mm3\n\t"
	    "movq	%%mm3, %%mm4\n\t"
	    "psrlq	$32, %%mm4\n\t"
	    "pfadd	%%mm4, %%mm3\n"
	    "\n\t"
	    "punpckldq	mm2, mm3\n\t"
	    "movq	[%0], mm2\n"
	    "punpckldq	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, (%0)\n"
	    "\n\t"
	    "movq	mm2, mm0\n\t"
	    "movq	mm3, mm1\n\t"
	    "pfmul	mm2, [%1 + 32]\n\t"
	    "pfmul	mm3, [%1 + 40]\n\t"
	    "pfadd	mm2, mm3\n\t"
	    "movq	mm3, mm2\n\t"
	    "psrlq	mm3, 32\n\t"
	    "pfadd	mm2, mm3\n"
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	32(%1), %%mm2\n\t"
	    "pfmul	40(%1), %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, %%mm3\n\t"
	    "psrlq	$32, %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n"
	    "\n\t"
	    "pfmul	mm0, [%1 + 48]\n\t"
	    "pfmul	mm1, [%1 + 56]\n\t"
	    "pfadd	mm0, mm1\n\t"
	    "movq	mm1, mm0\n\t"
	    "psrlq	mm1, 32\n\t"
	    "pfadd	mm0, mm1\n"
	    "pfmul	48(%1), %%mm0\n\t"
	    "pfmul	56(%1), %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    "movq	%%mm0, %%mm1\n\t"
	    "psrlq	$32, %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n"
	    "\n\t"
	    "punpckldq	mm2, mm0\n\t"
	    "movq	[%0 + 8], mm2\n"
	    "punpckldq	%%mm0, %%mm2\n\t"
	    "movq	%%mm2, 8(%0)\n"
	    "\n\t"
	    "femms"
	    :: "r"(&result), "r"(&self->_values), "r"(&vector)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);

	return result;

Modified src/OFSystemInfo.m from [3c4e0fe6ea] to [1b9d36f2fa].

294
295
296
297
298
299
300
301

302
303

304
305
306
307
308
309
310
294
295
296
297
298
299
300

301
302

303
304
305
306
307
308
309
310







-
+

-
+







	/*
	 * This workaround is required by older GCC versions when using -fPIC,
	 * as ebx is a special register in PIC code. Yes, GCC is indeed not
	 * able to just push a register onto the stack before the __asm__ block
	 * and to pop it afterwards.
	 */
	__asm__ (
	    "xchg{l}	{ %%ebx, %%edi | edi, ebx }\n\t"
	    "xchgl	%%ebx, %%edi\n\t"
	    "cpuid\n\t"
	    "xchg{l}	{ %%edi, %%ebx | ebx, edi }"
	    "xchgl	%%edi, %%ebx"
	    : "=a"(regs.eax), "=D"(regs.ebx), "=c"(regs.ecx), "=d"(regs.edx)
	    : "a"(eax), "c"(ecx)
	);
# else
	memset(&regs, 0, sizeof(regs));
# endif