ObjFW  Check-in [7f304f573b]

Overview
Comment:OFMatrix4x4: SSE4.1 for -[transformVectors:count:]

This requires the vectors to be 16 byte aligned. In order to achieve
this, the OFVector4D type is changed to have an alignment of 16 bytes.
However, this does *not* break ABI because the only method actually
requiring 16 byte alignment is -[transformVectors:count:], which was not
in ObjFW 1.0. Hence binaries compiled for ObjFW 1.0 have no 16 byte
alignment for OFVector4D, but also cannot ever call into any code that
needs it. (-[transformedVector:] calls into -[transformVectors:count:],
but creates a properly aligned copy that it passes.)

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 7f304f573b03eccbb08908fdebe6ec34b84119849b151c81bcfa12fcb1ee3134
User & Date: js on 2023-11-03 00:16:18
Other Links: manifest | tags
Context
2023-11-03
01:07
OFMatrix4x4: Remove SSE4.1 due to Clang bugs check-in: 7e1dbda4b4 user: js tags: trunk
00:16
OFMatrix4x4: SSE4.1 for -[transformVectors:count:] check-in: 7f304f573b user: js tags: trunk
2023-11-02
23:00
OFMatrix4x4: Minor cleanups check-in: b5c3a36731 user: js tags: trunk
Changes

Modified src/OFMatrix4x4.h from [5f3106a61c] to [12afeb3f7a].

19
20
21
22
23
24
25
26

27
28
29
30
31
32
33
19
20
21
22
23
24
25

26
27
28
29
30
31
32
33







-
+








/**
 * @brief A 4x4 matrix of floats.
 */
OF_SUBCLASSING_RESTRICTED
@interface OFMatrix4x4: OFObject <OFCopying>
{
	float _values[4][4];
	float _values[4][4] OF_ALIGN(16);
}

#ifdef OF_HAVE_CLASS_PROPERTIES
@property (readonly, class) OFMatrix4x4 *identityMatrix;
#endif

/**

Modified src/OFMatrix4x4.m from [740fcdd16f] to [46f1ac4383].

26
27
28
29
30
31
32












































33
34
35
36
37
38
39
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse4.1")
# endif
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	48(%2), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"
	    "movaps	(%1), %%xmm4\n\t"
	    "movaps	%%xmm4, %%xmm5\n\t"
	    "dpps	$0xFF, %%xmm0, %%xmm4\n\t"
	    "movaps	%%xmm5, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm1, %%xmm5\n\t"
	    "movaps	%%xmm6, %%xmm7\n\t"
	    "dpps	$0xFF, %%xmm2, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm3, %%xmm7\n\t"
	    "insertps	$0x10, %%xmm5, %%xmm4\n\t"
	    "insertps	$0x20, %%xmm6, %%xmm4\n\t"
	    "insertps	$0x30, %%xmm7, %%xmm4\n\t"
	    "movaps	%%xmm4, (%1)\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(&self->_values)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif

# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("3dnow,3dnowa")
# endif
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
262
263
264
265
266
267
268
269




270
271
272
273
274
275
276
306
307
308
309
310
311
312

313
314
315
316
317
318
319
320
321
322
323







-
+
+
+
+







		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supports3DNow]) {
	if ([OFSystemInfo supportsSSE41]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE41)
	} else if ([OFSystemInfo supports3DNow]) {
		if ([OFSystemInfo supportsEnhanced3DNow]) {
			REPLACE(@selector(multiplyWithMatrix:),
			    multiplyWithMatrix_enhanced3DNow)
			REPLACE(@selector(transformVectors:count:),
			    transformVectors_enhanced3DNow)
		} else {
			REPLACE(@selector(multiplyWithMatrix:),

Modified src/OFObject.h from [6d396e3db6] to [ea05d80823].

345
346
347
348
349
350
351
352

353
354
355
356
357
358
359
345
346
347
348
349
350
351

352
353
354
355
356
357
358
359







-
+







}

/**
 * @struct OFVector4D OFObject.h ObjFW/OFObject.h
 *
 * @brief A vector in 4D space.
 */
typedef struct OF_BOXABLE OFVector4D {
typedef struct OF_BOXABLE OF_ALIGN(16) OFVector4D {
	/** The x coordinate of the vector */
	float x;
	/** The y coordinate of the vector */
	float y;
	/** The z coordinate of the vector */
	float z;
	/** The w coordinate of the vector */

Modified src/macros.h from [d9f6837658] to [c4d1f32dd0].

97
98
99
100
101
102
103




104
105
106
107
108
109
110

111
112
113
114
115
116
117
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

114
115
116
117
118
119
120
121







+
+
+
+






-
+







# define OF_INLINE inline
# define OF_LIKELY(cond) (cond)
# define OF_UNLIKELY(cond) (cond)
# define OF_CONST_FUNC
# define OF_NO_RETURN_FUNC
# define OF_WEAK_REF(sym)
#endif

#ifdef __GNUC__
# define OF_ALIGN(alignment) __attribute__((__aligned__(alignment)))
#endif

#if __STDC_VERSION__ >= 201112L
# define OF_ALIGNOF(type) _Alignof(type)
# define OF_ALIGNAS(type) _Alignas(type)
#else
# define OF_ALIGNOF(type) __alignof__(type)
# define OF_ALIGNAS(type) __attribute__((__aligned__(__alignof__(type))))
# define OF_ALIGNAS(type) OF_ALIGN(OF_ALIGNOF(type))
#endif

#if __STDC_VERSION__ >= 201112L && defined(OF_HAVE_MAX_ALIGN_T)
# define OF_BIGGEST_ALIGNMENT _Alignof(max_align_t)
#else
# ifdef __BIGGEST_ALIGNMENT__
#  define OF_BIGGEST_ALIGNMENT __BIGGEST_ALIGNMENT__