ObjFW  Check-in [7f304f573b]

Overview
Comment:OFMatrix4x4: SSE4.1 for -[transformVectors:count:]

This requires the vectors to be 16 byte aligned. In order to achieve
this, the OFVector4D type is changed to have an alignment of 16 bytes.
However, this does *not* break ABI because the only method actually
requiring 16 byte alignment is -[transformVectors:count:], which was not
in ObjFW 1.0. Hence binaries compiled for ObjFW 1.0 have no 16 byte
alignment for OFVector4D, but also cannot ever call into any code that
needs it. (-[transformedVector:] calls into -[transformVectors:count:],
but creates a properly aligned copy that it passes.)

Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 7f304f573b03eccbb08908fdebe6ec34b84119849b151c81bcfa12fcb1ee3134
User & Date: js on 2023-11-03 00:16:18
Other Links: manifest | tags
Context
2023-11-03
01:07
OFMatrix4x4: Remove SSE4.1 due to Clang bugs check-in: 7e1dbda4b4 user: js tags: trunk
00:16
OFMatrix4x4: SSE4.1 for -[transformVectors:count:] check-in: 7f304f573b user: js tags: trunk
2023-11-02
23:00
OFMatrix4x4: Minor cleanups check-in: b5c3a36731 user: js tags: trunk
Changes

Modified src/OFMatrix4x4.h from [5f3106a61c] to [12afeb3f7a].

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

/**
 * @brief A 4x4 matrix of floats.
 */
OF_SUBCLASSING_RESTRICTED
@interface OFMatrix4x4: OFObject <OFCopying>
{
	float _values[4][4];
}

#ifdef OF_HAVE_CLASS_PROPERTIES
@property (readonly, class) OFMatrix4x4 *identityMatrix;
#endif

/**







|







19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

/**
 * @brief A 4x4 matrix of floats.
 */
OF_SUBCLASSING_RESTRICTED
@interface OFMatrix4x4: OFObject <OFCopying>
{
	float _values[4][4] OF_ALIGN(16);
}

#ifdef OF_HAVE_CLASS_PROPERTIES
@property (readonly, class) OFMatrix4x4 *identityMatrix;
#endif

/**

Modified src/OFMatrix4x4.m from [740fcdd16f] to [46f1ac4383].

26
27
28
29
30
31
32












































33
34
35
36
37
38
39
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)












































# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("3dnow,3dnowa")
# endif
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 0, 0, 0, 1 }
};

@implementation OFMatrix4x4
#if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__)
# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("sse4.1")
# endif
static void
transformVectors_SSE41(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	48(%2), %%xmm3\n"
	    "\n\t"
	    "0:\n\t"
	    "movaps	(%1), %%xmm4\n\t"
	    "movaps	%%xmm4, %%xmm5\n\t"
	    "dpps	$0xFF, %%xmm0, %%xmm4\n\t"
	    "movaps	%%xmm5, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm1, %%xmm5\n\t"
	    "movaps	%%xmm6, %%xmm7\n\t"
	    "dpps	$0xFF, %%xmm2, %%xmm6\n\t"
	    "dpps	$0xFF, %%xmm3, %%xmm7\n\t"
	    "insertps	$0x10, %%xmm5, %%xmm4\n\t"
	    "insertps	$0x20, %%xmm6, %%xmm4\n\t"
	    "insertps	$0x30, %%xmm7, %%xmm4\n\t"
	    "movaps	%%xmm4, (%1)\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "jnz	0b\n"
	    : "+r"(count), "+r"(vectors)
	    : "r"(&self->_values)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	      "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif

# ifndef __clang__
#  pragma GCC push_options
#  pragma GCC target("3dnow,3dnowa")
# endif
static void
multiplyWithMatrix_enhanced3DNow(OFMatrix4x4 *self, SEL _cmd,
    OFMatrix4x4 *matrix)
262
263
264
265
266
267
268
269



270
271
272
273
274
275
276
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supports3DNow]) {



		if ([OFSystemInfo supportsEnhanced3DNow]) {
			REPLACE(@selector(multiplyWithMatrix:),
			    multiplyWithMatrix_enhanced3DNow)
			REPLACE(@selector(transformVectors:count:),
			    transformVectors_enhanced3DNow)
		} else {
			REPLACE(@selector(multiplyWithMatrix:),







|
>
>
>







306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
		return;

# define REPLACE(selector, func)					\
	typeEncoding = method_getTypeEncoding(				\
	    class_getInstanceMethod(self, selector));			\
	class_replaceMethod(self, selector, (IMP)func, typeEncoding);

	if ([OFSystemInfo supportsSSE41]) {
		REPLACE(@selector(transformVectors:count:),
		    transformVectors_SSE41)
	} else if ([OFSystemInfo supports3DNow]) {
		if ([OFSystemInfo supportsEnhanced3DNow]) {
			REPLACE(@selector(multiplyWithMatrix:),
			    multiplyWithMatrix_enhanced3DNow)
			REPLACE(@selector(transformVectors:count:),
			    transformVectors_enhanced3DNow)
		} else {
			REPLACE(@selector(multiplyWithMatrix:),

Modified src/OFObject.h from [6d396e3db6] to [ea05d80823].

345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
}

/**
 * @struct OFVector4D OFObject.h ObjFW/OFObject.h
 *
 * @brief A vector in 4D space.
 */
typedef struct OF_BOXABLE OFVector4D {
	/** The x coordinate of the vector */
	float x;
	/** The y coordinate of the vector */
	float y;
	/** The z coordinate of the vector */
	float z;
	/** The w coordinate of the vector */







|







345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
}

/**
 * @struct OFVector4D OFObject.h ObjFW/OFObject.h
 *
 * @brief A vector in 4D space.
 */
typedef struct OF_BOXABLE OF_ALIGN(16) OFVector4D {
	/** The x coordinate of the vector */
	float x;
	/** The y coordinate of the vector */
	float y;
	/** The z coordinate of the vector */
	float z;
	/** The w coordinate of the vector */

Modified src/macros.h from [d9f6837658] to [c4d1f32dd0].

97
98
99
100
101
102
103




104
105
106
107
108
109
110
111
112
113
114
115
116
117
# define OF_INLINE inline
# define OF_LIKELY(cond) (cond)
# define OF_UNLIKELY(cond) (cond)
# define OF_CONST_FUNC
# define OF_NO_RETURN_FUNC
# define OF_WEAK_REF(sym)
#endif





#if __STDC_VERSION__ >= 201112L
# define OF_ALIGNOF(type) _Alignof(type)
# define OF_ALIGNAS(type) _Alignas(type)
#else
# define OF_ALIGNOF(type) __alignof__(type)
# define OF_ALIGNAS(type) __attribute__((__aligned__(__alignof__(type))))
#endif

#if __STDC_VERSION__ >= 201112L && defined(OF_HAVE_MAX_ALIGN_T)
# define OF_BIGGEST_ALIGNMENT _Alignof(max_align_t)
#else
# ifdef __BIGGEST_ALIGNMENT__
#  define OF_BIGGEST_ALIGNMENT __BIGGEST_ALIGNMENT__







>
>
>
>






|







97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# define OF_INLINE inline
# define OF_LIKELY(cond) (cond)
# define OF_UNLIKELY(cond) (cond)
# define OF_CONST_FUNC
# define OF_NO_RETURN_FUNC
# define OF_WEAK_REF(sym)
#endif

#ifdef __GNUC__
# define OF_ALIGN(alignment) __attribute__((__aligned__(alignment)))
#endif

#if __STDC_VERSION__ >= 201112L
# define OF_ALIGNOF(type) _Alignof(type)
# define OF_ALIGNAS(type) _Alignas(type)
#else
# define OF_ALIGNOF(type) __alignof__(type)
# define OF_ALIGNAS(type) OF_ALIGN(OF_ALIGNOF(type))
#endif

#if __STDC_VERSION__ >= 201112L && defined(OF_HAVE_MAX_ALIGN_T)
# define OF_BIGGEST_ALIGNMENT _Alignof(max_align_t)
#else
# ifdef __BIGGEST_ALIGNMENT__
#  define OF_BIGGEST_ALIGNMENT __BIGGEST_ALIGNMENT__