-
Notifications
You must be signed in to change notification settings - Fork 2
/
filter_arm.s
447 lines (368 loc) · 13.2 KB
/
filter_arm.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
@ Pithesiser - a software synthesiser for Raspberry Pi
@ Copyright (C) 2015 Nicholas Tuckett
@
@ This program is free software: you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation, either version 3 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program. If not, see <http:@www.gnu.org/licenses/>.
.globl filter_apply_asm
.globl filter_apply_interp_asm
.globl filter_apply_hp_asm
.globl filter_apply_interp_hp_asm
.globl __aeabi_idiv
@ r0: signed positive numerator
@ r1: signed positive denominator
idiv:
neg r1, r1
mov r2, #0
adds r0, r0, r0
.rept 32
adcs r2, r1, r2, lsl #1
subcc r2, r2, r1
adcs r0, r0, r0
.endr
mov pc,lr
@ r0: sample_data
@ r1: sample_count
@ r2: filter_state
.set ROUNDING_OFFSET, 8192
.set PRECISION, 14
.set FS_INPUT_C0, 0
.set FS_INPUT_C1, 4
.set FS_INPUT_C2, 8
.set FS_OUTPUT_C0, 12
.set FS_OUTPUT_C1, 16
.set FS_HISTORY0, 20
.set FS_HISTORY1, 24
.set FS_OUTPUT0, 28
.set FS_OUTPUT1, 32
filter_apply_asm:
stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
sub sp, #4
ldr r8,[r2, #FS_INPUT_C0] @ ic[0]
ldr r9,[r2, #FS_INPUT_C1] @ ic[1]
ldr r10,[r2, #FS_INPUT_C2] @ ic[2]
ldr r11,[r2, #FS_OUTPUT_C0] @ oc[0]
ldr r12,[r2, #FS_OUTPUT_C1] @ oc[1]
ldr r3,[r2, #FS_HISTORY0] @ h[0]
ldr r4,[r2, #FS_HISTORY1] @ h[1]
ldr r5,[r2, #FS_OUTPUT0] @ o[0]
ldr r6,[r2, #FS_OUTPUT1] @ o[1]
str r2,[sp]
b .L1
.L0:
mov r4, r3 @ h[1] = h[0]
mov r6, r5 @ o[1] = o[0]
mov r3, r2 @ h[0] = last input
mov r5, r7 @ o[0] = last output
.L1:
ldrsh r2,[r0] @ load new sample
subs r1,r1,#1 @ decrement overall count (here to avoid stall from 16-bit load above)
mul r7,r10,r4 @ h[1] * ic[2] (helps to avoid stall from 16-bit load above)
mla r7,r2,r8,r7 @ += sample * ic[0] (now use available 16-bit loaded value)
mla r7,r3,r9,r7 @ += h[0] * ic[1]
mla r7,r5,r11,r7 @ += o[0] * oc[0]
mla r7,r6,r12,r7 @ += o[1] * oc[1]
add r7, r7, #ROUNDING_OFFSET @ round new_sample
mov r7, r7, asr #PRECISION
strh r7,[r0], #2
bne .L0 @ checks r1 for zero
ldr r1, [sp]
str r2, [r1, #FS_HISTORY0] @ store final history & output state
str r3, [r1, #FS_HISTORY1]
str r7, [r1, #FS_OUTPUT0]
str r5, [r1, #FS_OUTPUT1]
add sp, #4
ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
@ r0: sample_data
@ r1: sample_count
@ r2: filter_state_current
@ r3: filter_state_last
.set S_INTERPOLANT, 0
.set S_INTERPOLATE_STEP, S_INTERPOLANT + 4
.set S_INPUT_C0, S_INTERPOLATE_STEP + 4
.set S_INPUT_C1, S_INPUT_C0 + 4
.set S_INPUT_C2, S_INPUT_C1 + 4
.set S_OUTPUT_C0, S_INPUT_C2 + 4
.set S_OUTPUT_C1, S_OUTPUT_C0 + 4
.set S_LAST_INPUT_C0, S_OUTPUT_C1 + 4
.set S_LAST_INPUT_C1, S_LAST_INPUT_C0 + 4
.set S_LAST_INPUT_C2, S_LAST_INPUT_C1 + 4
.set S_LAST_OUTPUT_C0, S_LAST_INPUT_C2 + 4
.set S_LAST_OUTPUT_C1, S_LAST_OUTPUT_C0 + 4
.set S_FRAME_SIZE, S_LAST_OUTPUT_C1 + 4
.set INTERP_PRECISION, 15
.set INTERP_ONE, 32768
filter_apply_interp_asm:
@ save registers & reserve stack space
stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
sub sp, #8
str r2, [sp]
str r3, [sp, #4]
@ Calculate interpolation step: (1 << INTERP_PRECISION) / sample_count
stmfd sp!, {r0, r1, r2, r3}
mov r0, #INTERP_ONE
bl __aeabi_idiv
mov r12, r0
ldmfd sp!, {r0, r1, r2, r3}
@ Copy state coefficients onto stack (saves a register in the loop)
ldr r4, [r3, #FS_INPUT_C0]
ldr r5, [r3, #FS_INPUT_C1]
ldr r6, [r3, #FS_INPUT_C2]
ldr r7, [r3, #FS_OUTPUT_C0]
ldr r8, [r3, #FS_OUTPUT_C1]
stmfd sp!, {r4, r5, r6, r7, r8}
ldr r4, [r2, #FS_INPUT_C0]
ldr r5, [r2, #FS_INPUT_C1]
ldr r6, [r2, #FS_INPUT_C2]
ldr r7, [r2, #FS_OUTPUT_C0]
ldr r8, [r2, #FS_OUTPUT_C1]
stmfd sp!, {r4, r5, r6, r7, r8}
@ Set up history & output into registers for temporary use
@ Can assume history is same for both last and current
ldr r7, [r3, #FS_HISTORY0] @ lh & h[0]
ldr r8, [r3, #FS_HISTORY1] @ lh & h[1]
ldr r9, [r3, #FS_OUTPUT0] @ lo[0]
ldr r10, [r3, #FS_OUTPUT1] @ lo[1]
ldr r5, [r2, #FS_OUTPUT0] @ o[0]
ldr r6, [r2, #FS_OUTPUT1] @ o[1]
@ Set interpolation values and start loop
sub r11, r11, r11
stmfd sp!, {r11, r12}
b .L3
.L2:
@ advance historical data
mov r8, r7 @ h[]
mov r7, r2
mov r10, r9 @ lo[]
mov r9, r3
mov r6, r5 @ o[]
mov r5, r4
.L3:
@ r0 -> sample data, r1 = sample count, r5 to r10 = historical data
@ r2 = current sample, r3 = sample from last filter, r4 = sample from current filter
@ r11, r12 = temp values
@ r14 could be used... (lr, saved for return)
ldrsh r2, [r0] @ load new sample to r2
@ carry out last state calculations (to r3)
@ round result & hold onto it (avoid store if possible)
ldr r11, [sp, #S_LAST_INPUT_C2]
ldr r12, [sp, #S_LAST_INPUT_C0]
mul r3, r8, r11 @ h[1] * ic[2]
ldr r11, [sp, #S_LAST_INPUT_C1]
mla r3, r2, r12, r3 @ += sample * ic[0]
ldr r12, [sp, #S_LAST_OUTPUT_C0]
mla r3, r7, r11, r3 @ += h[0] * ic[1]
ldr r11, [sp, #S_LAST_OUTPUT_C1]
mla r3, r9, r12, r3 @ += lo[0] * oc[0]
mla r3, r10, r11, r3 @ += lo[1] * oc[1]
add r3, r3, #ROUNDING_OFFSET @ round new_sample
mov r3, r3, asr #PRECISION
@ carry out current state calculations (to r4)
@ round result & store it (avoid store if possible)
ldr r11, [sp, #S_INPUT_C2]
ldr r12, [sp, #S_INPUT_C0]
mul r4, r8, r11 @ h[1] * ic[2]
ldr r11, [sp, #S_INPUT_C1]
mla r4, r2, r12, r4 @ += sample * ic[0]
ldr r12, [sp, #S_OUTPUT_C0]
mla r4, r7, r11, r4 @ += h[0] * ic[1]
ldr r11, [sp, #S_OUTPUT_C1]
mla r4, r5, r12, r4 @ += lo[0] * oc[0]
mla r4, r6, r11, r4 @ += lo[1] * oc[1]
add r4, r4, #ROUNDING_OFFSET @ round new_sample
mov r4, r4, asr #PRECISION
@ calculate interpolated result and store
ldr r11, [sp, #S_INTERPOLANT]
subs r1, r1, #1 @ decrement overall count (here to avoid stall from load above)
mul r12, r11, r4 @ i * new_sample
rsb r11, r11, #INTERP_ONE
mla r12, r11, r3, r12 @ += (1 - i) * last_sample
mov r12, r12, asr #INTERP_PRECISION
strh r12, [r0], #2
@ loop for next sample
ldr r12, [sp, #S_INTERPOLATE_STEP]
rsb r11, r11, #INTERP_ONE
add r11, r12, r11
str r11, [sp, #S_INTERPOLANT]
bne .L2 @ based on flags set from decrement of r1
@ write back historical data to state
add sp, sp, #S_FRAME_SIZE
ldr r1, [sp]
str r2,[r1, #FS_HISTORY0] @ h[0]
str r7,[r1, #FS_HISTORY1] @ h[1]
str r4,[r1, #FS_OUTPUT0] @ o[0]
str r5,[r1, #FS_OUTPUT1] @ o[1]
ldr r1, [sp, #4]
str r2,[r1, #FS_HISTORY0] @ lh[0]
str r7,[r1, #FS_HISTORY1] @ lh[1]
str r3,[r1, #FS_OUTPUT0] @ lo[0]
str r9,[r1, #FS_OUTPUT1] @ lo[1]
@ release stack space, restore registers & return
add sp, #8
ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
.set HP_ROUNDING_OFFSET, 131072
.set HP_PRECISION, 18
filter_apply_hp_asm:
stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
sub sp, #4
ldr r8,[r2, #FS_INPUT_C0] @ ic[0]
ldr r9,[r2, #FS_INPUT_C1] @ ic[1]
ldr r10,[r2, #FS_INPUT_C2] @ ic[2]
ldr r11,[r2, #FS_OUTPUT_C0] @ oc[0]
ldr r12,[r2, #FS_OUTPUT_C1] @ oc[1]
ldr r3,[r2, #FS_HISTORY0] @ h[0]
ldr r4,[r2, #FS_HISTORY1] @ h[1]
ldr r5,[r2, #FS_OUTPUT0] @ o[0]
ldr r6,[r2, #FS_OUTPUT1] @ o[1]
str r2,[sp]
b .L5
.L4:
mov r4, r3 @ h[1] = h[0]
mov r6, r5 @ o[1] = o[0]
mov r3, r2 @ h[0] = last input
mov r5, r7 @ o[0] = last output
.L5:
ldrsh r2,[r0] @ load new sample
sub r1,r1,#1 @ decrement overall count (here to avoid stall from 16-bit load above)
smull r7,r14,r10,r4 @ h[1] * ic[2] (helps to avoid stall from 16-bit load above)
smlal r7,r14,r2,r8 @ += sample * ic[0] (now use available 16-bit loaded value)
smlal r7,r14,r3,r9 @ += h[0] * ic[1]
smlal r7,r14,r5,r11 @ += o[0] * oc[0]
smlal r7,r14,r6,r12 @ += o[1] * oc[1]
adds r7,r7, #HP_ROUNDING_OFFSET @ round new_sample
adc r14,r14, #0
mov r7, r7, lsr #HP_PRECISION
orr r7, r14, lsl #32-HP_PRECISION
strh r7,[r0], #2
cmp r1, #0
bne .L4 @ checks r1 for zero
ldr r1, [sp]
str r2, [r1, #FS_HISTORY0] @ store final history & output state
str r3, [r1, #FS_HISTORY1]
str r7, [r1, #FS_OUTPUT0]
str r5, [r1, #FS_OUTPUT1]
add sp, #4
ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
mov pc, lr
filter_apply_interp_hp_asm:
@ save registers & reserve stack space
stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
sub sp, #8
str r2, [sp]
str r3, [sp, #4]
@ Calculate interpolation step: (1 << INTERP_PRECISION) / sample_count
stmfd sp!, {r0, r1, r2, r3}
mov r0, #INTERP_ONE
bl __aeabi_idiv
mov r12, r0
ldmfd sp!, {r0, r1, r2, r3}
@ Copy state coefficients onto stack (saves a register in the loop)
ldr r4, [r3, #FS_INPUT_C0]
ldr r5, [r3, #FS_INPUT_C1]
ldr r6, [r3, #FS_INPUT_C2]
ldr r7, [r3, #FS_OUTPUT_C0]
ldr r8, [r3, #FS_OUTPUT_C1]
stmfd sp!, {r4, r5, r6, r7, r8}
ldr r4, [r2, #FS_INPUT_C0]
ldr r5, [r2, #FS_INPUT_C1]
ldr r6, [r2, #FS_INPUT_C2]
ldr r7, [r2, #FS_OUTPUT_C0]
ldr r8, [r2, #FS_OUTPUT_C1]
stmfd sp!, {r4, r5, r6, r7, r8}
@ Set up history & output into registers for temporary use
@ Can assume history is same for both last and current
ldr r7, [r3, #FS_HISTORY0] @ lh & h[0]
ldr r8, [r3, #FS_HISTORY1] @ lh & h[1]
ldr r9, [r3, #FS_OUTPUT0] @ lo[0]
ldr r10, [r3, #FS_OUTPUT1] @ lo[1]
ldr r5, [r2, #FS_OUTPUT0] @ o[0]
ldr r6, [r2, #FS_OUTPUT1] @ o[1]
@ Set interpolation values and start loop
sub r11, r11, r11
stmfd sp!, {r11, r12}
b .L7
.L6:
@ advance historical data
mov r8, r7 @ h[]
mov r7, r2
mov r10, r9 @ lo[]
mov r9, r3
mov r6, r5 @ o[]
mov r5, r4
.L7:
@ r0 -> sample data, r1 = sample count, r5 to r10 = historical data
@ r2 = current sample, r3 = sample from last filter, r4 = sample from current filter
@ r11, r12 = temp values
@ r14 could be used... (lr, saved for return)
ldrsh r2, [r0] @ load new sample to r2
@ carry out last state calculations (to r3)
@ round result & hold onto it (avoid store if possible)
ldr r11, [sp, #S_LAST_INPUT_C2]
ldr r12, [sp, #S_LAST_INPUT_C0]
smull r3, r14, r8, r11 @ h[1] * ic[2]
ldr r11, [sp, #S_LAST_INPUT_C1]
smlal r3, r14, r2, r12 @ += sample * ic[0]
ldr r12, [sp, #S_LAST_OUTPUT_C0]
smlal r3, r14, r7, r11 @ += h[0] * ic[1]
ldr r11, [sp, #S_LAST_OUTPUT_C1]
smlal r3, r14, r9, r12 @ += lo[0] * oc[0]
smlal r3, r14, r10, r11 @ += lo[1] * oc[1]
adds r3, r3, #HP_ROUNDING_OFFSET @ round new_sample
adc r14, r14, #0
mov r3, r3, lsr #HP_PRECISION
orr r3, r14, lsl #32-HP_PRECISION
@ carry out current state calculations (to r4)
@ round result & store it (avoid store if possible)
ldr r11, [sp, #S_INPUT_C2]
ldr r12, [sp, #S_INPUT_C0]
smull r4, r14, r8, r11 @ h[1] * ic[2]
ldr r11, [sp, #S_INPUT_C1]
smlal r4, r14, r2, r12 @ += sample * ic[0]
ldr r12, [sp, #S_OUTPUT_C0]
smlal r4, r14, r7, r11 @ += h[0] * ic[1]
ldr r11, [sp, #S_OUTPUT_C1]
smlal r4, r14, r5, r12 @ += lo[0] * oc[0]
smlal r4, r14, r6, r11 @ += lo[1] * oc[1]
adds r4, r4, #HP_ROUNDING_OFFSET @ round new_sample
adc r14, r14, #0
mov r4, r4, lsr #HP_PRECISION
orr r4, r14, lsl #32-HP_PRECISION
@ calculate interpolated result and store
ldr r11, [sp, #S_INTERPOLANT]
subs r1, r1, #1 @ decrement overall count (here to avoid stall from load above)
mul r12, r11, r4 @ i * new_sample
rsb r11, r11, #INTERP_ONE
mla r12, r11, r3, r12 @ += (1 - i) * last_sample
mov r12, r12, asr #INTERP_PRECISION
strh r12, [r0], #2
@ loop for next sample
ldr r12, [sp, #S_INTERPOLATE_STEP]
rsb r11, r11, #INTERP_ONE
add r11, r12, r11
str r11, [sp, #S_INTERPOLANT]
bne .L6 @ based on flags set from decrement of r1
@ write back historical data to state
add sp, sp, #S_FRAME_SIZE
ldr r1, [sp]
str r2,[r1, #FS_HISTORY0] @ h[0]
str r7,[r1, #FS_HISTORY1] @ h[1]
str r4,[r1, #FS_OUTPUT0] @ o[0]
str r5,[r1, #FS_OUTPUT1] @ o[1]
ldr r1, [sp, #4]
str r2,[r1, #FS_HISTORY0] @ lh[0]
str r7,[r1, #FS_HISTORY1] @ lh[1]
str r3,[r1, #FS_OUTPUT0] @ lo[0]
str r9,[r1, #FS_OUTPUT1] @ lo[1]
@ release stack space, restore registers & return
add sp, #8
ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
mov pc,lr