ObjFW  Check-in [7e1dbda4b4]

Overview
Comment:OFMatrix4x4: Remove SSE4.1 due to Clang bugs

Clang fails to correctly align the _values ivar in 32-bit mode, despite
OF_ALIGN(16). Trying to force proper alignment by creating a new typedef
or by putting the ivar into an aligned struct instead makes Clang's
builtin memcpy fail in another, entirely unrelated method (one that has
no inline assembly): the inlined memcpy then tries to use movaps on
unaligned data, probably because Clang still assumes the data to be
unaligned there and tries to memcpy using SSE after adding an offset
that would have made it aligned - but which now makes it unaligned.

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 7e1dbda4b4998e1308ffc28183f5a351d209c0fb703f82c4df42f9c74f3170ba
User & Date: js on 2023-11-03 01:07:38
Other Links: manifest | tags
Context
2023-11-04
13:10
OFMatrix4x4: Restore SSE4.1 code check-in: 0eb97e4612 user: js tags: trunk
13:06
Work around Clang not aligning ivars correctly check-in: 055e14fc75 user: js tags: trunk
2023-11-03
01:07
OFMatrix4x4: Remove SSE4.1 due to Clang bugs check-in: 7e1dbda4b4 user: js tags: trunk
00:16
OFMatrix4x4: SSE4.1 for -[transformVectors:count:] check-in: 7f304f573b user: js tags: trunk
Changes

Modified src/OFMatrix4x4.m from [46f1ac4383] to [740fcdd16f].

26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse4.1")
# endif
/*
 * SSE4.1 implementation installed for -[transformVectors:count:].
 *
 * The four 16-byte rows at &self->_values are loaded into xmm0-xmm3 once.
 * Then, for each of the `count` elements of `vectors`, the dot product of the
 * vector with every row is computed (dpps), the four results are packed into
 * a single register (insertps) and written back over the input vector.
 *
 * Every memory access uses movaps, so both self->_values and `vectors` have
 * to be 16-byte aligned.
 *
 * `_cmd` is unused; the signature matches what class_replaceMethod() expects
 * when this function is installed as the method's IMP.
 *
 * A `count` of 0 is a no-op.
 */
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    /*
	     * Nothing to do for count == 0. This must jump *past* the loop:
	     * jumping to the loop head would run the body with xmm0-xmm3
	     * unloaded and let the counter wrap around on the first dec.
	     */
	    "jz		1f\n"
	    "\n\t"
	    /* Load the matrix rows once; they are loop-invariant. */
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	48(%2), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"
	    /*
	     * dpps overwrites its destination, so the vector is copied into
	     * xmm5-xmm7 to have one copy per row.
	     */
	    "movaps	(%1), %%xmm4\n\t"
	    "movaps	%%xmm4, %%xmm5\n\t"
	    "dpps	$0xFF, %%xmm0, %%xmm4\n\t"
	    "movaps	%%xmm5, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm1, %%xmm5\n\t"
	    "movaps	%%xmm6, %%xmm7\n\t"
	    "dpps	$0xFF, %%xmm2, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm3, %%xmm7\n\t"
	    /* Gather the four dot products into lanes 0-3 of xmm4. */
	    "insertps	$0x10, %%xmm5, %%xmm4\n\t"
	    "insertps	$0x20, %%xmm6, %%xmm4\n\t"
	    "insertps	$0x30, %%xmm7, %%xmm4\n\t"
	    "movaps	%%xmm4, (%1)\n"
	    "\n\t"
	    /* Advance to the next 16-byte vector. */
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "1:\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(&self->_values)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif

# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("3dnow,3dnowa")
# endif
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







26
27
28
29
30
31
32












































33
34
35
36
37
38
39
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)












































# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("3dnow,3dnowa")
# endif
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE41]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE41)
	} else if ([OFSystemInfo supports3DNow]) {
		if ([OFSystemInfo supportsEnhanced3DNow]) {
			REPLACE(@selector(multiplyWithMatrix:),
			    multiplyWithMatrix_enhanced3DNow)
			REPLACE(@selector(transformVectors:count:),
			    transformVectors_enhanced3DNow)
		} else {
			REPLACE(@selector(multiplyWithMatrix:),







|
<
<
<







262
263
264
265
266
267
268
269



270
271
272
273
274
275
276
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supports3DNow]) {



		if ([OFSystemInfo supportsEnhanced3DNow]) {
			REPLACE(@selector(multiplyWithMatrix:),
			    multiplyWithMatrix_enhanced3DNow)
			REPLACE(@selector(transformVectors:count:),
			    transformVectors_enhanced3DNow)
		} else {
			REPLACE(@selector(multiplyWithMatrix:),

Modified src/OFObject.h from [ea05d80823] to [53e8f943e6].

345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
}

/**
 * @struct OFVector4D OFObject.h ObjFW/OFObject.h
 *
 * @brief A vector in 4D space.
 */
typedef struct OF_BOXABLE OF_ALIGN(16) OFVector4D {
	/** The x coordinate of the vector */
	float x;
	/** The y coordinate of the vector */
	float y;
	/** The z coordinate of the vector */
	float z;
	/** The w coordinate of the vector */







|







345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
}

/**
 * @struct OFVector4D OFObject.h ObjFW/OFObject.h
 *
 * @brief A vector in 4D space.
 */
typedef struct OF_ALIGN(16) OF_BOXABLE OFVector4D {
	/** The x coordinate of the vector */
	float x;
	/** The y coordinate of the vector */
	float y;
	/** The z coordinate of the vector */
	float z;
	/** The w coordinate of the vector */