ObjFW  Check-in [d9af65de97]

Overview
Comment:Don't use -masm=intel

It's broken in older versions of Clang (e.g. Clang 12).

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: d9af65de971c5ae1e4544916cc11e6045a7fcd2f30d0226bc37ef37fb2b14ac6
User & Date: js on 2023-10-31 20:25:57
Other Links: manifest | tags
Context
2023-10-31
20:27
OFMatrix4x4: Partially unroll multiplication loop check-in: d53c87e7bb user: js tags: trunk
20:25
Don't use -masm=intel check-in: d9af65de97 user: js tags: trunk
2023-10-30
23:58
OFMatrix4x4: Use 3DNow! to transform vectors check-in: 1ac0583aae user: js tags: trunk
Changes

Modified configure.ac from [ac89f801b1] to [3d6a6f352e].

330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
AX_CHECK_COMPILER_FLAGS(-fno-common, [OBJCFLAGS="$OBJCFLAGS -fno-common"])
AX_CHECK_COMPILER_FLAGS(-Xclang -fno-constant-cfstrings, [
	flag="-Xclang -fno-constant-cfstrings"
	OBJCFLAGS="$OBJCFLAGS $flag"
	OBJFW_OBJCFLAGS="$OBJFW_OBJCFLAGS $flag"
])

case "$host_cpu" in
i*86|x86_64)
	AX_CHECK_COMPILER_FLAGS([-masm=intel], [
		OBJCFLAGS="$OBJCFLAGS -masm=intel"
		AC_DEFINE(HAVE_INTEL_SYNTAX, 1, [Whether asm syntax is Intel])
	])
	;;
esac

AX_CHECK_COMPILER_FLAGS([-Wsign-compare -Werror],
	[OBJCFLAGS="$OBJCFLAGS -Wsign-compare"])
AS_IF([test x"$with_nds" != x"yes"], [
	AX_CHECK_COMPILER_FLAGS([-Wshadow -Werror],
		[OBJCFLAGS="$OBJCFLAGS -Wshadow"])
])
AX_CHECK_COMPILER_FLAGS([-Wshorten-64-to-32 -Werror],







<
<
<
<
<
<
<
<
<







330
331
332
333
334
335
336









337
338
339
340
341
342
343
AX_CHECK_COMPILER_FLAGS(-fno-common, [OBJCFLAGS="$OBJCFLAGS -fno-common"])
AX_CHECK_COMPILER_FLAGS(-Xclang -fno-constant-cfstrings, [
	flag="-Xclang -fno-constant-cfstrings"
	OBJCFLAGS="$OBJCFLAGS $flag"
	OBJFW_OBJCFLAGS="$OBJFW_OBJCFLAGS $flag"
])










AX_CHECK_COMPILER_FLAGS([-Wsign-compare -Werror],
	[OBJCFLAGS="$OBJCFLAGS -Wsign-compare"])
AS_IF([test x"$with_nds" != x"yes"], [
	AX_CHECK_COMPILER_FLAGS([-Wshadow -Werror],
		[OBJCFLAGS="$OBJCFLAGS -Wshadow"])
])
AX_CHECK_COMPILER_FLAGS([-Wshorten-64-to-32 -Werror],

Modified src/OFMatrix4x4.m from [9f0079818c] to [c5f86994f5].

25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
	{ 1, 0, 0, 0 },
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(HAVE_INTEL_SYNTAX)
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float result[4][4] = {{ 0 }};

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			__asm__ (
			    "movd	mm0, [%2]\n\t"
			    "punpckldq	mm0, [%2 + 16]\n\t"
			    "pfmul	mm0, [%1]\n\t"
			    "movd	mm1, [%2 + 32]\n\t"
			    "punpckldq	mm1, [%2 + 48]\n\t"
			    "pfmul	mm1, [%1 + 8]\n\t"
			    "pfadd	mm0, mm1\n\t"
			    "movq	mm1, mm0\n\t"
			    "psrlq	mm1, 32\n\t"
			    "pfadd	mm0, mm1\n\t"
			    "movd	%0, mm0"
			    :: "m"(result[i][j]), "r"(&matrix->_values[i][0]),
			       "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
		}
	}

	__asm__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}

static OFVector4D
transformedVector_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D vector)
{
	OFVector4D result;

	__asm__ (
	    "movq	mm0, [%2]\n\t"
	    "movq	mm1, [%2 + 8]\n"
	    "\n\t"
	    "movq	mm2, mm0\n\t"
	    "movq	mm3, mm1\n\t"
	    "pfmul	mm2, [%1]\n\t"
	    "pfmul	mm3, [%1 + 8]\n\t"
	    "pfadd	mm2, mm3\n\t"
	    "movq	mm3, mm2\n\t"
	    "psrlq	mm3, 32\n\t"
	    "pfadd	mm2, mm3\n"
	    "\n\t"
	    "movq	mm3, mm0\n\t"
	    "movq	mm4, mm1\n\t"
	    "pfmul	mm3, [%1 + 16]\n\t"
	    "pfmul	mm4, [%1 + 24]\n\t"
	    "pfadd	mm3, mm4\n\t"
	    "movq	mm4, mm3\n\t"
	    "psrlq	mm4, 32\n\t"
	    "pfadd	mm3, mm4\n"
	    "\n\t"
	    "punpckldq	mm2, mm3\n\t"
	    "movq	[%0], mm2\n"
	    "\n\t"
	    "movq	mm2, mm0\n\t"
	    "movq	mm3, mm1\n\t"
	    "pfmul	mm2, [%1 + 32]\n\t"
	    "pfmul	mm3, [%1 + 40]\n\t"
	    "pfadd	mm2, mm3\n\t"
	    "movq	mm3, mm2\n\t"
	    "psrlq	mm3, 32\n\t"
	    "pfadd	mm2, mm3\n"
	    "\n\t"
	    "pfmul	mm0, [%1 + 48]\n\t"
	    "pfmul	mm1, [%1 + 56]\n\t"
	    "pfadd	mm0, mm1\n\t"
	    "movq	mm1, mm0\n\t"
	    "psrlq	mm1, 32\n\t"
	    "pfadd	mm0, mm1\n"
	    "\n\t"
	    "punpckldq	mm2, mm0\n\t"
	    "movq	[%0 + 8], mm2\n"
	    "\n\t"
	    "femms"
	    :: "r"(&result), "r"(&self->_values), "r"(&vector)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);

	return result;







|



|




|
|
|
|
|
|
|
|
|
|
|


















|
|

|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|

|
|

|
|
|
|
|
|
|
|

|
|
|
|
|
|

|
|







25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
	{ 1, 0, 0, 0 },
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if defined(OF_AMD64) || defined(OF_X86)
static void
multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix)
{
	float result[4][4];

	for (uint_fast8_t i = 0; i < 4; i++) {
		for (uint_fast8_t j = 0; j < 4; j++) {
			__asm__ (
			    "movd	(%2), %%mm0\n\t"
			    "punpckldq	16(%2), %%mm0\n\t"
			    "pfmul	(%1), %%mm0\n\t"
			    "movd	32(%2), %%mm1\n\t"
			    "punpckldq	48(%2), %%mm1\n\t"
			    "pfmul	8(%1), %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movq	%%mm0, %%mm1\n\t"
			    "psrlq	$32, %%mm1\n\t"
			    "pfadd	%%mm1, %%mm0\n\t"
			    "movd	%%mm0, %0"
			    :: "m"(result[i][j]), "r"(&matrix->_values[i][0]),
			       "r"(&self->_values[0][j])
			    : "mm0", "mm1", "memory"
			);
		}
	}

	__asm__ ("femms");

	memcpy(self->_values, result, sizeof(result));
}

static OFVector4D
transformedVector_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D vector)
{
	OFVector4D result;

	__asm__ (
	    "movq	(%2), %%mm0\n\t"
	    "movq	8(%2), %%mm1\n"
	    "\n\t"
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	(%1), %%mm2\n\t"
	    "pfmul	8(%1), %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, %%mm3\n\t"
	    "psrlq	$32, %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n"
	    "\n\t"
	    "movq	%%mm0, %%mm3\n\t"
	    "movq	%%mm1, %%mm4\n\t"
	    "pfmul	16(%1), %%mm3\n\t"
	    "pfmul	24(%1), %%mm4\n\t"
	    "pfadd	%%mm4, %%mm3\n\t"
	    "movq	%%mm3, %%mm4\n\t"
	    "psrlq	$32, %%mm4\n\t"
	    "pfadd	%%mm4, %%mm3\n"
	    "\n\t"
	    "punpckldq	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, (%0)\n"
	    "\n\t"
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	32(%1), %%mm2\n\t"
	    "pfmul	40(%1), %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, %%mm3\n\t"
	    "psrlq	$32, %%mm3\n\t"
	    "pfadd	%%mm3, %%mm2\n"
	    "\n\t"
	    "pfmul	48(%1), %%mm0\n\t"
	    "pfmul	56(%1), %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n\t"
	    "movq	%%mm0, %%mm1\n\t"
	    "psrlq	$32, %%mm1\n\t"
	    "pfadd	%%mm1, %%mm0\n"
	    "\n\t"
	    "punpckldq	%%mm0, %%mm2\n\t"
	    "movq	%%mm2, 8(%0)\n"
	    "\n\t"
	    "femms"
	    :: "r"(&result), "r"(&self->_values), "r"(&vector)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);

	return result;

Modified src/OFSystemInfo.m from [3c4e0fe6ea] to [1b9d36f2fa].

294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
	/*
	 * This workaround is required by older GCC versions when using -fPIC,
	 * as ebx is a special register in PIC code. Yes, GCC is indeed not
	 * able to just push a register onto the stack before the __asm__ block
	 * and to pop it afterwards.
	 */
	__asm__ (
	    "xchg{l}	{ %%ebx, %%edi | edi, ebx }\n\t"
	    "cpuid\n\t"
	    "xchg{l}	{ %%edi, %%ebx | ebx, edi }"
	    : "=a"(regs.eax), "=D"(regs.ebx), "=c"(regs.ecx), "=d"(regs.edx)
	    : "a"(eax), "c"(ecx)
	);
# else
	memset(&regs, 0, sizeof(regs));
# endif








|

|







294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
	/*
	 * This workaround is required by older GCC versions when using -fPIC,
	 * as ebx is a special register in PIC code. Yes, GCC is indeed not
	 * able to just push a register onto the stack before the __asm__ block
	 * and to pop it afterwards.
	 */
	__asm__ (
	    "xchgl	%%ebx, %%edi\n\t"
	    "cpuid\n\t"
	    "xchgl	%%edi, %%ebx"
	    : "=a"(regs.eax), "=D"(regs.ebx), "=c"(regs.ecx), "=d"(regs.edx)
	    : "a"(eax), "c"(ecx)
	);
# else
	memset(&regs, 0, sizeof(regs));
# endif