/*
 * Copyright (c) 2010, Christopher Friedt <chrisfriedt@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "cpy1d.h"
20
21
#ifdef HAVE_NEON_CPY1D
22
23
/*
 * Clobber lists for the inline-assembly kernels in this file.
 *
 * Each macro expands to a comma-separated list of NEON quad-register names,
 * always starting at q0, suitable for the clobber section of an extended asm
 * statement. The name says how many registers are listed.
 */
#define ALL_Q_REGS \
	"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
	"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
#define EIGHT_Q_REGS \
	"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
#define FOUR_Q_REGS \
	"q0", "q1", "q2", "q3"
#define TWO_Q_REGS \
	"q0", "q1"
#define ONE_Q_REG \
	"q0"
34
35
/*
 * Width-specific copy helpers used by cpy1d_neon(); the numeric suffix is the
 * number of R values moved per step. They are declared here because their
 * definitions appear after cpy1d_neon() in this file.
 */
static inline void cpy1d_neon_1(R *I, R *O, INT n0, INT is0, INT os0);
static inline void cpy1d_neon_2(R *I, R *O, INT n0, INT is0, INT os0);
static inline void cpy1d_neon_4(R *I, R *O, INT n0, INT is0, INT os0);
38
39
/*
 * Copy n0 groups of vl consecutive R values from I to O.
 *
 * Element j of group k is read from I[k * is0 + j] and written to
 * O[k * os0 + j]; is0 and os0 are therefore strides counted in R elements.
 *
 * Group widths 1, 2 and 4 are special-cased. A run of width-1 (or width-2)
 * groups that is contiguous and has an even count is re-interpreted as half
 * as many groups of twice the width, so that fully contiguous data ends up
 * in the 4-wide NEON kernel. Any other width uses a plain nested loop.
 */
static void cpy1d_neon(R *I, R *O, INT n0, INT is0, INT os0, INT vl)
{
	INT row, col;

	if (vl != 1 && vl != 2 && vl != 4) {
		/* Generic width: element-by-element copy. */
		for (row = 0; row < n0; ++row) {
			for (col = 0; col < vl; ++col) {
				R tmp = I[row * is0 + col];
				O[row * os0 + col] = tmp;
			}
		}
		return;
	}

	if (vl == 1) {
		if ((n0 & 1) != 0 || is0 != 1 || os0 != 1) {
			/* Odd count or strided scalars: cannot be paired up. */
			cpy1d_neon_1(I, O, n0, is0, os0);
			return;
		}
		/* Contiguous scalars, even count: view them as n0/2 pairs. */
		n0 /= 2;
		is0 = 2;
		os0 = 2;
	}

	if (vl != 4) {
		/* Here the data is pair-shaped (vl was 1 or 2 on entry). */
		if ((n0 & 1) != 0 || is0 != 2 || os0 != 2) {
			/*
			 * Odd count or strided pairs. The call to cpy1d_neon_2()
			 * is disabled, so the pairs are copied in plain C; both
			 * values are read before either is written.
			 */
			while (n0 > 0) {
				R first = I[0];
				R second = I[1];
				O[0] = first;
				O[1] = second;
				--n0;
				I += is0;
				O += os0;
			}
			return;
		}
		/* Contiguous pairs, even count: view them as n0/2 quads. */
		n0 /= 2;
		is0 = 4;
		os0 = 4;
	}

	cpy1d_neon_4(I, O, n0, is0, os0);
}
76
/*
 * Assembly-template building blocks for cpy1d_neon_4().
 *
 * CPY4_LD / CPY4_ST emit one post-indexed 128-bit NEON load or store through
 * the named asm operands src/dst, advancing the pointer by is/os bytes.
 * `hint` is CPY4_ALIGNED or CPY4_UNALIGNED and is spliced into the address
 * operand. CPY4_LDn / CPY4_STn chain n of them over registers q0..q(n-1).
 */
#define CPY4_ALIGNED   ",:128"
#define CPY4_UNALIGNED ""

#define CPY4_LD(q, hint) "vld1.32 {" q "}, [%[src]" hint "], %[is] \n\t"
#define CPY4_ST(q, hint) "vst1.32 {" q "}, [%[dst]" hint "], %[os] \n\t"

#define CPY4_LD1(h)  CPY4_LD("q0", h)
#define CPY4_LD2(h)  CPY4_LD1(h) CPY4_LD("q1", h)
#define CPY4_LD4(h)  CPY4_LD2(h) CPY4_LD("q2", h) CPY4_LD("q3", h)
#define CPY4_LD8(h)  CPY4_LD4(h) CPY4_LD("q4", h) CPY4_LD("q5", h) \
                     CPY4_LD("q6", h) CPY4_LD("q7", h)
#define CPY4_LD16(h) CPY4_LD8(h) CPY4_LD("q8", h) CPY4_LD("q9", h) \
                     CPY4_LD("q10", h) CPY4_LD("q11", h) \
                     CPY4_LD("q12", h) CPY4_LD("q13", h) \
                     CPY4_LD("q14", h) CPY4_LD("q15", h)

#define CPY4_ST1(h)  CPY4_ST("q0", h)
#define CPY4_ST2(h)  CPY4_ST1(h) CPY4_ST("q1", h)
#define CPY4_ST4(h)  CPY4_ST2(h) CPY4_ST("q2", h) CPY4_ST("q3", h)
#define CPY4_ST8(h)  CPY4_ST4(h) CPY4_ST("q4", h) CPY4_ST("q5", h) \
                     CPY4_ST("q6", h) CPY4_ST("q7", h)
#define CPY4_ST16(h) CPY4_ST8(h) CPY4_ST("q8", h) CPY4_ST("q9", h) \
                     CPY4_ST("q10", h) CPY4_ST("q11", h) \
                     CPY4_ST("q12", h) CPY4_ST("q13", h) \
                     CPY4_ST("q14", h) CPY4_ST("q15", h)

/*
 * One asm statement that performs all loads of a chunk followed by all of
 * its stores. Keeping both halves in a single statement means the compiler
 * never gets a chance to touch the q registers between load and store. The
 * "memory" clobber tells it that memory is read and written behind its back;
 * the variadic arguments are the q registers the chunk overwrites.
 */
#define CPY4_ASM(loads, stores, ...) \
	__asm__ __volatile__( \
		loads stores \
		: [src] "+&r" (src), [dst] "+&r" (dst) \
		: [is] "r" (in_step), [os] "r" (out_step) \
		: "memory", __VA_ARGS__)

/* Copy a chunk of k vectors, picking the alignment hint for each side. */
#define CPY4_COPY(k, ...) \
	do { \
		if (src_aligned && dst_aligned) { \
			CPY4_ASM(CPY4_LD##k(CPY4_ALIGNED), \
			         CPY4_ST##k(CPY4_ALIGNED), __VA_ARGS__); \
		} else if (src_aligned) { \
			CPY4_ASM(CPY4_LD##k(CPY4_ALIGNED), \
			         CPY4_ST##k(CPY4_UNALIGNED), __VA_ARGS__); \
		} else if (dst_aligned) { \
			CPY4_ASM(CPY4_LD##k(CPY4_UNALIGNED), \
			         CPY4_ST##k(CPY4_ALIGNED), __VA_ARGS__); \
		} else { \
			CPY4_ASM(CPY4_LD##k(CPY4_UNALIGNED), \
			         CPY4_ST##k(CPY4_UNALIGNED), __VA_ARGS__); \
		} \
	} while (0)

/*
 * Copy n0 groups of four R values from I to O using NEON q registers.
 *
 * Group k is read from I + k * is0 and written to O + k * os0, i.e. the
 * strides are counted in R elements (cpy1d_neon() passes 4 for contiguous
 * data). Each group is moved with one 16-byte vld1.32/vst1.32, so this
 * kernel assumes sizeof(R) == 4 -- NOTE(review): confirm R is single
 * precision whenever HAVE_NEON_CPY1D is defined.
 *
 * Groups are processed in chunks of 16, then 8, 4, 2 and 1; within a chunk
 * all loads are issued before any store. Nothing is copied when n0 <= 0.
 *
 * The ":128" address hint is used for a side only when both its base address
 * and its byte stride are multiples of 16, because every access of the run,
 * not just the first, has to be 16-byte aligned for the hint to be valid.
 */
static inline void cpy1d_neon_4(R *I, R *O, INT n0, INT is0, INT os0) {
	const R *src = I;
	R *dst = O;
	/* Post-index increments are in bytes; the strides are in R elements. */
	const long in_step = (long)is0 * (long)sizeof(R);
	const long out_step = (long)os0 * (long)sizeof(R);
	/* unsigned long is pointer-sized on the ARM targets this asm runs on. */
	const int src_aligned =
		((unsigned long)I % 16 == 0) && (in_step % 16 == 0);
	const int dst_aligned =
		((unsigned long)O % 16 == 0) && (out_step % 16 == 0);
	INT n;

	if (n0 <= 0) {
		return;
	}

	for (n = n0; n >= 16; n -= 16) {
		/* Upper eight registers are listed explicitly. */
		CPY4_COPY(16, EIGHT_Q_REGS,
		          "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
	}
	if (n >= 8) {
		CPY4_COPY(8, EIGHT_Q_REGS);
		n -= 8;
	}
	if (n >= 4) {
		CPY4_COPY(4, FOUR_Q_REGS);
		n -= 4;
	}
	if (n >= 2) {
		CPY4_COPY(2, TWO_Q_REGS);
		n -= 2;
	}
	if (n >= 1) {
		CPY4_COPY(1, ONE_Q_REG);
	}
}

#undef CPY4_COPY
#undef CPY4_ASM
#undef CPY4_ST16
#undef CPY4_ST8
#undef CPY4_ST4
#undef CPY4_ST2
#undef CPY4_ST1
#undef CPY4_LD16
#undef CPY4_LD8
#undef CPY4_LD4
#undef CPY4_LD2
#undef CPY4_LD1
#undef CPY4_ST
#undef CPY4_LD
#undef CPY4_UNALIGNED
#undef CPY4_ALIGNED
500
/*
 * Copy n0 pairs of R values from I to O.
 *
 * Pair k is read from I[k * is0] and I[k * is0 + 1] and written to
 * O[k * os0] and O[k * os0 + 1]; strides are counted in R elements. Both
 * values of a pair are read before either is written. Nothing is copied
 * when n0 <= 0.
 *
 * TODO: this is a plain C implementation; no NEON kernel exists for the
 * two-wide case yet. The original author noted that the Cortex-A8 TRM
 * suggests "vld1.32 {d0}, [rN,:128]" is possible but that the instruction
 * raised SIGILL in practice, and wondered whether that was a kernel bug.
 * NOTE(review): presumably a single-D-register list only accepts a 64-bit
 * alignment hint -- confirm against the ARM Architecture Reference Manual
 * before writing the NEON version.
 */
static inline void cpy1d_neon_2(R *I, R *O, INT n0, INT is0, INT os0) {
	for (; n0 > 0; --n0, I += is0, O += os0) {
		R x0 = I[0];
		R x1 = I[1];
		O[0] = x0;
		O[1] = x1;
	}
}
606
/*
 * Copy n0 single R values from I to O.
 *
 * Value k is read from I[k * is0] and written to O[k * os0]; strides are
 * counted in R elements. Nothing is copied when n0 <= 0.
 *
 * cpy1d_neon() calls this for width-1 data that is strided or has an odd
 * count. It is a plain C loop: a lone scalar per step leaves nothing to
 * vectorise.
 */
static inline void cpy1d_neon_1(R *I, R *O, INT n0, INT is0, INT os0) {
	for (; n0 > 0; --n0, I += is0, O += os0) {
		*O = *I;
	}
}
609
#endif