ObjFW  Diff

Differences From Artifact [a4f9a5f812]:

To Artifact [c5d68e8b03]:


37
38
39
40
41
42
43
44

45
46
47
48
49



50
51

52
53
54
55

56
57
58
59
60
61
62




63
64
65
66
67
68
69




70
71
72
73
74
75
76




77
78
79
80
81

82
83
84
85
86
87




88
89
90
91
92




93
94
95


96

97
98



99
100
101
102
103
104
105
37
38
39
40
41
42
43

44
45
46



47
48
49
50

51
52
53
54

55
56
57
58




59
60
61
62
63
64
65




66
67
68
69
70
71
72




73
74
75
76
77
78
79
80

81
82
83




84
85
86
87
88




89
90
91
92
93


94
95
96
97


98
99
100
101
102
103
104
105
106
107







-
+


-
-
-
+
+
+

-
+



-
+



-
-
-
-
+
+
+
+



-
-
-
-
+
+
+
+



-
-
-
-
+
+
+
+




-
+


-
-
-
-
+
+
+
+

-
-
-
-
+
+
+
+

-
-
+
+

+
-
-
+
+
+







static void
transformVectors_SSE(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	OF_ALIGN(16) float tmp[4];

	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "test	%[count], %[count]\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "movaps	(%2), %%xmm0\n\t"
	    "movaps	16(%2), %%xmm1\n\t"
	    "movaps	32(%2), %%xmm2\n\t"
	    "movaps	(%[matrix]), %%xmm0\n\t"
	    "movaps	16(%[matrix]), %%xmm1\n\t"
	    "movaps	32(%[matrix]), %%xmm2\n\t"
# ifdef OF_AMD64
	    "movaps	48(%2), %%xmm8\n"
	    "movaps	48(%[matrix]), %%xmm8\n"
# endif
	    "\n\t"
	    "0:\n\t"
	    "movaps	(%1), %%xmm3\n"
	    "movaps	(%[vectors]), %%xmm3\n"
	    "\n\t"
	    "movaps	%%xmm0, %%xmm4\n\t"
	    "mulps	%%xmm3, %%xmm4\n\t"
	    "movaps	%%xmm4, (%3)\n\t"
	    "addss	4(%3), %%xmm4\n\t"
	    "addss	8(%3), %%xmm4\n\t"
	    "addss	12(%3), %%xmm4\n"
	    "movaps	%%xmm4, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm4\n\t"
	    "addss	8(%[tmp]), %%xmm4\n\t"
	    "addss	12(%[tmp]), %%xmm4\n"
	    "\n\t"
	    "movaps	%%xmm1, %%xmm5\n\t"
	    "mulps	%%xmm3, %%xmm5\n\t"
	    "movaps	%%xmm5, (%3)\n\t"
	    "addss	4(%3), %%xmm5\n\t"
	    "addss	8(%3), %%xmm5\n\t"
	    "addss	12(%3), %%xmm5\n"
	    "movaps	%%xmm5, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm5\n\t"
	    "addss	8(%[tmp]), %%xmm5\n\t"
	    "addss	12(%[tmp]), %%xmm5\n"
	    "\n\t"
	    "movaps	%%xmm2, %%xmm6\n\t"
	    "mulps	%%xmm3, %%xmm6\n\t"
	    "movaps	%%xmm6, (%3)\n\t"
	    "addss	4(%3), %%xmm6\n\t"
	    "addss	8(%3), %%xmm6\n\t"
	    "addss	12(%3), %%xmm6\n"
	    "movaps	%%xmm6, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm6\n\t"
	    "addss	8(%[tmp]), %%xmm6\n\t"
	    "addss	12(%[tmp]), %%xmm6\n"
	    "\n\t"
# ifdef OF_AMD64
	    "movaps	%%xmm8, %%xmm7\n\t"
# else
	    "movaps	48(%2), %%xmm7\n\t"
	    "movaps	48(%[matrix]), %%xmm7\n\t"
# endif
	    "mulps	%%xmm3, %%xmm7\n\t"
	    "movaps	%%xmm7, (%3)\n\t"
	    "addss	4(%3), %%xmm7\n\t"
	    "addss	8(%3), %%xmm7\n\t"
	    "addss	12(%3), %%xmm7\n"
	    "movaps	%%xmm7, (%[tmp])\n\t"
	    "addss	4(%[tmp]), %%xmm7\n\t"
	    "addss	8(%[tmp]), %%xmm7\n\t"
	    "addss	12(%[tmp]), %%xmm7\n"
	    "\n\t"
	    "movss	%%xmm4, (%1)\n\t"
	    "movss	%%xmm5, 4(%1)\n\t"
	    "movss	%%xmm6, 8(%1)\n\t"
	    "movss	%%xmm7, 12(%1)\n"
	    "movss	%%xmm4, (%[vectors])\n\t"
	    "movss	%%xmm5, 4(%[vectors])\n\t"
	    "movss	%%xmm6, 8(%[vectors])\n\t"
	    "movss	%%xmm7, 12(%[vectors])\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "add	$16, %[vectors]\n\t"
	    "dec	%[count]\n\t"
	    "jnz	0b\n"
	    : [count] "+r" (count),
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values), "r"(&tmp)
	      [vectors] "+r" (vectors)
	    : [matrix] "r" (self->_values),
	      [tmp] "r" (&tmp)
	    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
# ifdef OF_AMD64
	      "xmm8",
# endif
	      "memory"
	);
}
117
118
119
120
121
122
123
124
125
126
127
128
129






130
131
132
133
134
135
136
137
138







139
140
141
142
143
144
145
146
147







148
149
150
151
152
153
154
155
156







157
158
159

160
161
162


163
164
165
166
167
168





169
170
171
172
173
174
175
176
177
178
179

180
181
182
183
184


185
186
187
188
189


190
191
192
193
194
195
196


197
198
199
200
201

202
203
204
205
206


207
208
209
210
211


212
213
214
215
216

217
218
219


220
221
222
223

224
225


226
227
228
229
230
231
232
119
120
121
122
123
124
125






126
127
128
129
130
131
132
133







134
135
136
137
138
139
140
141
142







143
144
145
146
147
148
149
150
151







152
153
154
155
156
157
158
159
160

161
162


163
164
165
166
167
168


169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

184
185
186
187


188
189
190
191
192


193
194
195
196
197
198
199


200
201
202
203
204
205

206
207
208
209


210
211
212
213
214


215
216
217
218
219
220

221
222


223
224
225
226
227
228
229


230
231
232
233
234
235
236
237
238







-
-
-
-
-
-
+
+
+
+
+
+


-
-
-
-
-
-
-
+
+
+
+
+
+
+


-
-
-
-
-
-
-
+
+
+
+
+
+
+


-
-
-
-
-
-
-
+
+
+
+
+
+
+


-
+

-
-
+
+




-
-
+
+
+
+
+










-
+



-
-
+
+



-
-
+
+





-
-
+
+




-
+



-
-
+
+



-
-
+
+




-
+

-
-
+
+




+
-
-
+
+







	float (*left)[4] = matrix->_values, (*right)[4] = self->_values;
	float result[4][4], (*resultPtr)[4] = result;

	__asm__ __volatile__ (
	    "movl	$4, %%ecx\n\t"
	    "\n\t"
	    "0:\n\t"
	    "movd	(%2), %%mm0\n\t"
	    "punpckldq	16(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	32(%2), %%mm1\n\t"
	    "punpckldq  48(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "movd	(%[right]), %%mm0\n\t"
	    "punpckldq	16(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	32(%[right]), %%mm1\n\t"
	    "punpckldq  48(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, (%0)\n\t"
	    "movd	4(%2), %%mm0\n\t"
	    "punpckldq	20(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	36(%2), %%mm1\n\t"
	    "punpckldq  52(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "movd	%%mm0, (%[result])\n\t"
	    "movd	4(%[right]), %%mm0\n\t"
	    "punpckldq	20(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	36(%[right]), %%mm1\n\t"
	    "punpckldq  52(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 4(%0)\n\t"
	    "movd	8(%2), %%mm0\n\t"
	    "punpckldq	24(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	40(%2), %%mm1\n\t"
	    "punpckldq  56(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "movd	%%mm0, 4(%[result])\n\t"
	    "movd	8(%[right]), %%mm0\n\t"
	    "punpckldq	24(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	40(%[right]), %%mm1\n\t"
	    "punpckldq  56(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 8(%0)\n\t"
	    "movd	12(%2), %%mm0\n\t"
	    "punpckldq	28(%2), %%mm0\n\t"
	    "pfmul	(%1), %%mm0\n\t"
	    "movd	44(%2), %%mm1\n\t"
	    "punpckldq  60(%2), %%mm1\n\t"
	    "pfmul	8(%1), %%mm1\n\t"
	    "movd	%%mm0, 8(%[result])\n\t"
	    "movd	12(%[right]), %%mm0\n\t"
	    "punpckldq	28(%[right]), %%mm0\n\t"
	    "pfmul	(%[left]), %%mm0\n\t"
	    "movd	44(%[right]), %%mm1\n\t"
	    "punpckldq  60(%[right]), %%mm1\n\t"
	    "pfmul	8(%[left]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "movd	%%mm0, 12(%0)\n"
	    "movd	%%mm0, 12(%[result])\n"
	    "\n\t"
	    "add	$16, %0\n\t"
	    "add	$16, %1\n\t"
	    "add	$16, %[result]\n\t"
	    "add	$16, %[left]\n\t"
	    "decl	%%ecx\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "femms"
	    : "+r"(resultPtr), "+r"(left), "+r"(right)
	    :: "ecx", "mm0", "mm1", "memory"
	    : [result] "+r" (resultPtr),
	      [left] "+r" (left),
	      [right] "+r" (right)
	    :
	    : "ecx", "mm0", "mm1", "memory"
	);

	memcpy(self->_values, result, 16 * sizeof(float));
}

static void
transformVectors_3DNow(OFMatrix4x4 *self, SEL _cmd, OFVector4D *vectors,
    size_t count)
{
	__asm__ __volatile__ (
	    "test	%0, %0\n\t"
	    "test	%[count], %[count]\n\t"
	    "jz		0f\n"
	    "\n\t"
	    "0:\n\t"
	    "movq	(%1), %%mm0\n\t"
	    "movq	8(%1), %%mm1\n"
	    "movq	(%[vectors]), %%mm0\n\t"
	    "movq	8(%[vectors]), %%mm1\n"
	    "\n\t"
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	(%2), %%mm2\n\t"
	    "pfmul	8(%2), %%mm3\n\t"
	    "pfmul	(%[matrix]), %%mm2\n\t"
	    "pfmul	8(%[matrix]), %%mm3\n\t"
	    "pfacc	%%mm3, %%mm2\n\t"
	    "pfacc	%%mm2, %%mm2\n\t"
	    "\n\t"
	    "movq	%%mm0, %%mm3\n\t"
	    "movq	%%mm1, %%mm4\n\t"
	    "pfmul	16(%2), %%mm3\n\t"
	    "pfmul	24(%2), %%mm4\n\t"
	    "pfmul	16(%[matrix]), %%mm3\n\t"
	    "pfmul	24(%[matrix]), %%mm4\n\t"
	    "pfacc	%%mm4, %%mm3\n\t"
	    "pfacc	%%mm3, %%mm3\n\t"
	    "\n\t"
	    "punpckldq	%%mm3, %%mm2\n\t"
	    "movq	%%mm2, (%1)\n"
	    "movq	%%mm2, (%[vectors])\n"
	    "\n\t"
	    "movq	%%mm0, %%mm2\n\t"
	    "movq	%%mm1, %%mm3\n\t"
	    "pfmul	32(%2), %%mm2\n\t"
	    "pfmul	40(%2), %%mm3\n\t"
	    "pfmul	32(%[matrix]), %%mm2\n\t"
	    "pfmul	40(%[matrix]), %%mm3\n\t"
	    "pfacc	%%mm3, %%mm2\n\t"
	    "pfacc	%%mm2, %%mm2\n\t"
	    "\n\t"
	    "pfmul	48(%2), %%mm0\n\t"
	    "pfmul	56(%2), %%mm1\n\t"
	    "pfmul	48(%[matrix]), %%mm0\n\t"
	    "pfmul	56(%[matrix]), %%mm1\n\t"
	    "pfacc	%%mm1, %%mm0\n\t"
	    "pfacc	%%mm0, %%mm0\n\t"
	    "\n\t"
	    "punpckldq	%%mm0, %%mm2\n\t"
	    "movq	%%mm2, 8(%1)\n"
	    "movq	%%mm2, 8(%[vectors])\n"
	    "\n\t"
	    "add	$16, %1\n\t"
	    "dec	%0\n\t"
	    "add	$16, %[vectors]\n\t"
	    "dec	%[count]\n\t"
	    "jnz	0b\n"
	    "\n\t"
	    "0:\n\t"
	    "femms"
	    : [count] "+r" (count),
	    : "+r"(count), "+r"(vectors)
	    : "r"(self->_values)
	      [vectors] "+r" (vectors)
	    : [matrix] "r" (self->_values)
	    : "mm0", "mm1", "mm2", "mm3", "mm4", "memory"
	);
}
# ifndef __clang__
#  pragma GCC pop_options
# endif