Artifact 0c5b86a89ae81ebfc14c8fb6b1932d46a1d390ca7cc510418d8a85fe09842abb:
- File
src/OFMatrix4x4.m
— part of check-in
[1af54eb2c2]
at
2024-04-02 03:21:12
on branch trunk
— Only align OFVector4D where necessary
Changing the alignment of OFVector4D globally (as was done previously)
would have technically been an ABI break in extremely rare cases.
However, since we only need the alignment for the methods added after
1.0, it's better to only have the alignment there and get back to full
ABI compatibility with 1.0. (user: js, size: 10345) [annotate] [blame] [check-ins using]
/* * Copyright (c) 2008-2024 Jonathan Schleifer <js@nil.im> * * All rights reserved. * * This file is part of ObjFW. It may be distributed under the terms of the * Q Public License 1.0, which can be found in the file LICENSE.QPL included in * the packaging of this file. * * Alternatively, it may be distributed under the terms of the GNU General * Public License, either version 2 or 3, which can be found in the file * LICENSE.GPLv2 or LICENSE.GPLv3 respectively included in the packaging of this * file. */ #include "config.h" #import "OFMatrix4x4.h" #import "OFString.h" #import "OFSystemInfo.h" #import "OFOnce.h" static const float identityValues[4][4] = { { 1, 0, 0, 0 }, { 0, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 } }; @implementation OFMatrix4x4 #if (defined(OF_AMD64) || defined(OF_X86)) && defined(__GNUC__) # ifndef __clang__ # pragma GCC push_options # pragma GCC target("sse") # endif static void transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { OF_ALIGN(16) float tmp[4]; __asm__ __volatile__ ( "test %[count], %[count]\n\t" "jz 0f\n" "\n\t" "movaps (%[matrix]), %%xmm0\n\t" "movaps 16(%[matrix]), %%xmm1\n\t" "movaps 32(%[matrix]), %%xmm2\n\t" # ifdef OF_AMD64 "movaps 48(%[matrix]), %%xmm8\n" # endif "\n\t" "0:\n\t" "movaps (%[vectors]), %%xmm3\n" "\n\t" "movaps %%xmm0, %%xmm4\n\t" "mulps %%xmm3, %%xmm4\n\t" "movaps %%xmm4, (%[tmp])\n\t" "addss 4(%[tmp]), %%xmm4\n\t" "addss 8(%[tmp]), %%xmm4\n\t" "addss 12(%[tmp]), %%xmm4\n" "\n\t" "movaps %%xmm1, %%xmm5\n\t" "mulps %%xmm3, %%xmm5\n\t" "movaps %%xmm5, (%[tmp])\n\t" "addss 4(%[tmp]), %%xmm5\n\t" "addss 8(%[tmp]), %%xmm5\n\t" "addss 12(%[tmp]), %%xmm5\n" "\n\t" "movaps %%xmm2, %%xmm6\n\t" "mulps %%xmm3, %%xmm6\n\t" "movaps %%xmm6, (%[tmp])\n\t" "addss 4(%[tmp]), %%xmm6\n\t" "addss 8(%[tmp]), %%xmm6\n\t" "addss 12(%[tmp]), %%xmm6\n" "\n\t" # ifdef OF_AMD64 "movaps %%xmm8, %%xmm7\n\t" # else "movaps 48(%[matrix]), %%xmm7\n\t" # endif "mulps %%xmm3, %%xmm7\n\t" "movaps %%xmm7, (%[tmp])\n\t" "addss 4(%[tmp]), %%xmm7\n\t" "addss 8(%[tmp]), %%xmm7\n\t" "addss 12(%[tmp]), %%xmm7\n" "\n\t" "movss %%xmm4, (%[vectors])\n\t" "movss %%xmm5, 4(%[vectors])\n\t" "movss %%xmm6, 8(%[vectors])\n\t" "movss %%xmm7, 12(%[vectors])\n" "\n\t" "add $16, %[vectors]\n\t" "dec %[count]\n\t" "jnz 0b\n" : [count] "+r" (count), [vectors] "+r" (vectors) : [matrix] "r" (self->_values), [tmp] "r" (&tmp) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", # ifdef OF_AMD64 "xmm8", # endif "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif # ifndef __clang__ # pragma GCC push_options # pragma GCC target("3dnow") # endif static void multiplyWithMatrix_3DNow(OFMatrix4x4 *self, SEL _cmd, OFMatrix4x4 *matrix) { float (*left)[4] = matrix->_values, (*right)[4] = self->_values; float result[4][4], (*resultPtr)[4] = result; __asm__ __volatile__ ( "movl $4, %%ecx\n\t" "\n\t" "0:\n\t" "movd (%[right]), %%mm0\n\t" "punpckldq 16(%[right]), %%mm0\n\t" "pfmul (%[left]), %%mm0\n\t" "movd 32(%[right]), %%mm1\n\t" "punpckldq 48(%[right]), %%mm1\n\t" "pfmul 8(%[left]), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, (%[result])\n\t" "movd 4(%[right]), %%mm0\n\t" "punpckldq 20(%[right]), %%mm0\n\t" "pfmul (%[left]), %%mm0\n\t" "movd 36(%[right]), %%mm1\n\t" "punpckldq 52(%[right]), %%mm1\n\t" "pfmul 8(%[left]), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 4(%[result])\n\t" "movd 8(%[right]), %%mm0\n\t" "punpckldq 24(%[right]), %%mm0\n\t" "pfmul (%[left]), %%mm0\n\t" "movd 40(%[right]), %%mm1\n\t" "punpckldq 56(%[right]), %%mm1\n\t" "pfmul 8(%[left]), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 8(%[result])\n\t" "movd 12(%[right]), %%mm0\n\t" "punpckldq 28(%[right]), %%mm0\n\t" "pfmul (%[left]), %%mm0\n\t" "movd 44(%[right]), %%mm1\n\t" "punpckldq 60(%[right]), %%mm1\n\t" "pfmul 8(%[left]), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "movd %%mm0, 12(%[result])\n" "\n\t" "add $16, %[result]\n\t" "add $16, %[left]\n\t" "decl %%ecx\n\t" "jnz 0b\n" "\n\t" "femms" : [result] "+r" (resultPtr), [left] "+r" (left), [right] "+r" (right) : : "ecx", "mm0", "mm1", "memory" ); memcpy(self->_values, result, 16 * sizeof(float)); } static void transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors, size_t count) { __asm__ __volatile__ ( "test %[count], %[count]\n\t" "jz 0f\n" "\n\t" "0:\n\t" "movq (%[vectors]), %%mm0\n\t" "movq 8(%[vectors]), %%mm1\n" "\n\t" "movq %%mm0, %%mm2\n\t" "movq %%mm1, %%mm3\n\t" "pfmul (%[matrix]), %%mm2\n\t" "pfmul 8(%[matrix]), %%mm3\n\t" "pfacc %%mm3, %%mm2\n\t" "pfacc %%mm2, %%mm2\n\t" "\n\t" "movq %%mm0, %%mm3\n\t" "movq %%mm1, %%mm4\n\t" "pfmul 16(%[matrix]), %%mm3\n\t" "pfmul 24(%[matrix]), %%mm4\n\t" "pfacc %%mm4, %%mm3\n\t" "pfacc %%mm3, %%mm3\n\t" "\n\t" "punpckldq %%mm3, %%mm2\n\t" "movq %%mm2, (%[vectors])\n" "\n\t" "movq %%mm0, %%mm2\n\t" "movq %%mm1, %%mm3\n\t" "pfmul 32(%[matrix]), %%mm2\n\t" "pfmul 40(%[matrix]), %%mm3\n\t" "pfacc %%mm3, %%mm2\n\t" "pfacc %%mm2, %%mm2\n\t" "\n\t" "pfmul 48(%[matrix]), %%mm0\n\t" "pfmul 56(%[matrix]), %%mm1\n\t" "pfacc %%mm1, %%mm0\n\t" "pfacc %%mm0, %%mm0\n\t" "\n\t" "punpckldq %%mm0, %%mm2\n\t" "movq %%mm2, 8(%[vectors])\n" "\n\t" "add $16, %[vectors]\n\t" "dec %[count]\n\t" "jnz 0b\n" "\n\t" "0:\n\t" "femms" : [count] "+r" (count), [vectors] "+r" (vectors) : [matrix] "r" (self->_values) : "mm0", "mm1", "mm2", "mm3", "mm4", "memory" ); } # ifndef __clang__ # pragma GCC pop_options # endif + (void)initialize { const char *typeEncoding; if (self != [OFMatrix4x4 class]) return; # define REPLACE(selector, func) \ typeEncoding = method_getTypeEncoding( \ class_getInstanceMethod(self, selector)); \ class_replaceMethod(self, selector, (IMP)func, typeEncoding); if ([OFSystemInfo supportsSSE]) { REPLACE(@selector(transformVectors:count:), transformVectors_SSE) } else if ([OFSystemInfo supports3DNow]) { REPLACE(@selector(multiplyWithMatrix:), multiplyWithMatrix_3DNow) REPLACE(@selector(transformVectors:count:), transformVectors_3DNow) } # undef REPLACE } #endif + (OFMatrix4x4 *)identityMatrix { return [[[OFMatrix4x4 alloc] initWithValues: identityValues] autorelease]; } + (instancetype)matrixWithValues: (const float [4][4])values { return [[[self alloc] initWithValues: values] autorelease]; } - (instancetype)init { OF_INVALID_INIT_METHOD } - (instancetype)initWithValues: (const float [4][4])values { self = [super init]; memcpy(_values, values, 16 * sizeof(float)); return self; } - (float (*)[4])values { return _values; } - (instancetype)copy { return [[OFMatrix4x4 alloc] initWithValues: (const float (*)[4])_values]; } - (bool)isEqual: (OFMatrix4x4 *)matrix { if (![matrix isKindOfClass: [OFMatrix4x4 class]]) return false; return (memcmp(_values, matrix->_values, 16 * sizeof(float)) == 0); } - (unsigned long)hash { unsigned long hash; OFHashInit(&hash); for (uint_fast8_t i = 0; i < 4; i++) for (uint_fast8_t j = 0; j < 4; j++) OFHashAddHash(&hash, OFFloatToRawUInt32(_values[i][j])); OFHashFinalize(&hash); return hash; } - (void)multiplyWithMatrix: (OFMatrix4x4 *)matrix { float result[4][4]; for (uint_fast8_t i = 0; i < 4; i++) for (uint_fast8_t j = 0; j < 4; j++) result[i][j] = matrix->_values[i][0] * _values[0][j] + matrix->_values[i][1] * _values[1][j] + matrix->_values[i][2] * _values[2][j] + matrix->_values[i][3] * _values[3][j]; memcpy(_values, result, 16 * sizeof(float)); } - (void)translateWithVector: (OFVector3D)vector { OFMatrix4x4 *translation = [[OFMatrix4x4 alloc] initWithValues: (const float [4][4]){ { 1, 0, 0, vector.x }, { 0, 1, 0, vector.y }, { 0, 0, 1, vector.z }, { 0, 0, 0, 1 } }]; [self multiplyWithMatrix: translation]; [translation release]; } - (void)scaleWithVector: (OFVector3D)vector { OFMatrix4x4 *scale = [[OFMatrix4x4 alloc] initWithValues: (const float [4][4]){ { vector.x, 0, 0, 0 }, { 0, vector.y, 0, 0 }, { 0, 0, vector.z, 0 }, { 0, 0, 0, 1 } }]; [self multiplyWithMatrix: scale]; [scale release]; } - (OFVector4D)transformedVector: (OFVector4D)vector { OF_ALIGN(16) OFVector4D copy = vector; [self transformVectors: © count: 1]; return copy; } - (void)transformVectors: (OFVector4D *)vectors count: (size_t)count { for (size_t i = 0; i < count; i++) { OFVector4D vector = vectors[i]; vectors[i].x = _values[0][0] * vector.x + _values[0][1] * vector.y + _values[0][2] * vector.z + _values[0][3] * vector.w; vectors[i].y = _values[1][0] * vector.x + _values[1][1] * vector.y + _values[1][2] * vector.z + _values[1][3] * vector.w; vectors[i].z = _values[2][0] * vector.x + _values[2][1] * vector.y + _values[2][2] * vector.z + _values[2][3] * vector.w; vectors[i].w = _values[3][0] * vector.x + _values[3][1] * vector.y + _values[3][2] * vector.z + _values[3][3] * vector.w; } } - (OFString *)description { return [OFString stringWithFormat: @"<OFMatrix4x4: {\n" @"\t%g %g %g %g\n" @"\t%g %g %g %g\n" @"\t%g %g %g %g\n" @"\t%g %g %g %g\n" @"}>", _values[0][0], _values[0][1], _values[0][2], _values[0][3], _values[1][0], _values[1][1], _values[1][2], _values[1][3], _values[2][0], _values[2][1], _values[2][2], _values[2][3], _values[3][0], _values[3][1], _values[3][2], _values[3][3]]; } @end