Quote Originally Posted by gens View Post
please don't, just don't talk about audio processing
it's an area with many, many people who don't know about it (and some are "smart" nonetheless, that's why i don't call myself an audiophile)
here, a low pass FIR filter
i think it's 32 tap, can't remember

Code:
; flat assembler (FASM) source: the ELF64 executable is produced directly,
; with execution beginning at the `start` label.
format ELF64 executable
entry start

align 16
segment readable writeable

; Number of stereo frames handled per block.
samples equ 1024

; Input and output blocks. Each reserves samples*2 16-bit words, i.e. one
; word per channel per frame (samples*2*2 bytes, the size the code reads
; and writes per block).
align 16
buff_in: rw samples*2
align 16
buff_out: rw samples*2

; Fixed-point filter coefficients, signed 16-bit: 2^16 = 1.0
; 16-byte aligned because the code fetches them with aligned 16-byte loads.
align 16
coefficients:

; First table: 32 symmetric taps. These are the 64 bytes the code loads
; (coefficients .. coefficients+16*3). The values sum to 65538, so the
; gain at DC is ~1.0 in the 2^16 scale.
dw 5
dw 14
dw 35
dw 57
dw 39
dw -76
dw -328
dw -677
dw -945
dw -828
dw 16
dw 1799
dw 4428
dw 7441
dw 10109
dw 11680
dw 11680
dw 10109
dw 7441
dw 4428
dw 1799
dw 16
dw -828
dw -945
dw -677
dw -328
dw -76
dw 39
dw 57
dw 35
dw 14
dw 5

; Second table: another 32 symmetric taps (also summing to 65538), located
; at coefficients+64. Nothing in this listing references it.
dw 705
dw 1240
dw 1434
dw 1169
dw 456
dw -550
dw -1576
dw -2287
dw -2367
dw -1609
dw 25
dw 2377
dw 5106
dw 7760
dw 9862
dw 11024
dw 11024
dw 9862
dw 7760
dw 5106
dw 2377
dw 25
dw -1609
dw -2367
dw -2287
dw -1576
dw -550
dw 456
dw 1169
dw 1434
dw 1240
dw 705

; Padding after the tables (10 reserved qwords).
rq 10 ;just in case
; Scratch area of 100 qwords; the code in this listing never reads it.
tmp rq 100

; Linux x86-64 system call numbers.
sys_read equ 0
sys_write equ 1
sys_exit equ 60


align 16
segment readable executable

;-----------------------------------------------------------------------
; start - program entry: 32-tap FIR filter over a raw stereo stream.
;
; Reads interleaved signed 16-bit frames (left word, then right word)
; from stdin, filters each channel with the first 32 coefficients of
; `coefficients` (2^16 = 1.0) and writes the result to stdout.
;
; Exit status: 0 at end of input, 1 on a read or write error.
; A trailing partial frame (1-3 bytes) at end of input is dropped.
;
; Register roles
;   xmm0..xmm3    left  delay line, 32 x int16, newest sample in word 0 of xmm0
;   xmm4..xmm7    right delay line, same layout
;   xmm12..xmm15  the 32 coefficients, loaded once (syscalls preserve xmm state)
;   xmm8..xmm11   scratch
;   rax           byte offset of the current frame in buff_in / buff_out
;   rcx           frames left in the current block (clobbered by syscall,
;                 so it is only set after the reads are done)
;   rbx           bytes buffered so far / bytes written so far
;   r12           result of the most recent read
;   r13           number of bytes to write for the current block
;   r8, r9        saturation limits +32767 / -32768
;   r10, r11      scratch (r11 is also clobbered by syscall)
;-----------------------------------------------------------------------

block_bytes = samples*2*2               ; frames * 2 channels * 2 bytes
stdin       = 0
stdout      = 1

start:
        ; both delay lines start out silent
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        pxor    xmm6, xmm6
        pxor    xmm7, xmm7

        ; loop-invariant: coefficients and clamp limits
        movdqa  xmm12, [coefficients]
        movdqa  xmm13, [coefficients+16*1]
        movdqa  xmm14, [coefficients+16*2]
        movdqa  xmm15, [coefficients+16*3]
        mov     r8d, 32767
        mov     r9, -32768

filter:
        ; Fill buff_in. read() may return fewer bytes than requested
        ; (pipes, end of file), so keep reading until the block is full,
        ; the input ends, or an error occurs.
        xor     ebx, ebx                ; rbx = bytes buffered
fill_input:
        mov     edx, block_bytes
        sub     edx, ebx                ; room left in the block
        lea     rsi, [buff_in+rbx]
        mov     edi, stdin
        mov     eax, sys_read
        syscall
        mov     r12, rax                ; remembered for the end-of-block decision
        test    rax, rax
        jle     input_done              ; 0 = end of input, <0 = -errno
        add     rbx, rax
        cmp     rbx, block_bytes
        jb      fill_input

input_done:
        mov     rcx, rbx
        shr     rcx, 2                  ; whole frames available
        jz      end_fir                 ; nothing (more) to process
        mov     r13, rbx
        and     r13, -4                 ; bytes to write = whole frames only

        xor     eax, eax                ; rax = offset of current frame

loopy:
        movzx   edx, word [buff_in+rax]         ; left channel sample
        movzx   esi, word [buff_in+rax+2]       ; right channel sample

        ; Left delay line: shift the 32 words up by one position. The top
        ; word of each register is carried into the bottom of the next.
        pslldq  xmm3, 2
        movdqa  xmm10, xmm2
        psrldq  xmm10, 2*7
        por     xmm3, xmm10

        pslldq  xmm2, 2
        movdqa  xmm10, xmm1
        psrldq  xmm10, 2*7
        por     xmm2, xmm10

        pslldq  xmm1, 2
        movdqa  xmm10, xmm0
        psrldq  xmm10, 2*7
        por     xmm1, xmm10

        pslldq  xmm0, 2                 ; count is in bytes
        pinsrw  xmm0, edx, 0            ; newest sample goes into word 0

        ; Right delay line, same scheme.
        pslldq  xmm7, 2
        movdqa  xmm10, xmm6
        psrldq  xmm10, 2*7
        por     xmm7, xmm10

        pslldq  xmm6, 2
        movdqa  xmm10, xmm5
        psrldq  xmm10, 2*7
        por     xmm6, xmm10

        pslldq  xmm5, 2
        movdqa  xmm10, xmm4
        psrldq  xmm10, 2*7
        por     xmm5, xmm10

        pslldq  xmm4, 2
        pinsrw  xmm4, esi, 0

        ; ---- left channel: sum of 32 products ----
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        pmaddwd xmm8, xmm12             ; 4 dwords, each = two adjacent products
        pmaddwd xmm9, xmm13
        pmaddwd xmm10, xmm14
        pmaddwd xmm11, xmm15
        paddd   xmm8, xmm10
        paddd   xmm9, xmm11
        paddd   xmm8, xmm9              ; 4 partial sums
        movhlps xmm9, xmm8
        paddd   xmm8, xmm9              ; 2 partial sums in dwords 0 and 1

        ; Each of the two partial sums fits in 32 bits, but their total can
        ; exceed it for full-scale input, so the last add is done in 64 bits.
        ; (The previous code shifted the register by 32 BYTES here, which
        ; zeroed it and silently dropped the dword-1 partial sum.)
        movq    r10, xmm8
        movsxd  r11, r10d               ; partial sum 0, sign-extended
        sar     r10, 32                 ; partial sum 1, sign-extended
        add     r10, r11
        sar     r10, 16                 ; undo the 2^16 coefficient scale
        cmp     r10, r8
        cmovg   r10, r8                 ; saturate instead of wrapping
        cmp     r10, r9
        cmovl   r10, r9
        mov     [buff_out+rax], r10w

        ; ---- right channel: same reduction ----
        movdqa  xmm8, xmm4
        movdqa  xmm9, xmm5
        movdqa  xmm10, xmm6
        movdqa  xmm11, xmm7
        pmaddwd xmm8, xmm12
        pmaddwd xmm9, xmm13
        pmaddwd xmm10, xmm14
        pmaddwd xmm11, xmm15
        paddd   xmm8, xmm10
        paddd   xmm9, xmm11
        paddd   xmm8, xmm9
        movhlps xmm9, xmm8
        paddd   xmm8, xmm9

        movq    r10, xmm8
        movsxd  r11, r10d
        sar     r10, 32
        add     r10, r11
        sar     r10, 16
        cmp     r10, r8
        cmovg   r10, r8
        cmp     r10, r9
        cmovl   r10, r9
        mov     [buff_out+rax+2], r10w

        add     rax, 4                  ; next frame
        dec     rcx
        jnz     loopy

        ; Write exactly the frames that were produced; write() may also be
        ; partial, so loop until everything is out.
        xor     ebx, ebx                ; rbx = bytes written
write_out:
        mov     rdx, r13
        sub     rdx, rbx
        lea     rsi, [buff_out+rbx]
        mov     edi, stdout
        mov     eax, sys_write
        syscall
        test    rax, rax
        jle     io_error                ; error, or no progress possible
        add     rbx, rax
        cmp     rbx, r13
        jb      write_out

        test    r12, r12
        jg      filter                  ; last read delivered data: keep going

end_fir:
        test    r12, r12
        js      io_error                ; input ended because read failed
        xor     edi, edi                ; status 0: clean end of input
        jmp     do_exit
io_error:
        mov     edi, 1
do_exit:
        mov     eax, sys_exit
        syscall
bash-4.2# time ./example < hell.raw >hell2.raw

real 0m2.206s
user 0m0.129s
sys 0m0.251s

where hell.raw is a 64MB, ~6.25 minutes long, 44100Hz, 16bit stereo
and most of that time is spent in read() and write()
I don't get your point, do you mean that audio processing is irrelevant since an assembly language low pass filter is spending most time in read/write and not processing?
I might have misunderstood, but I understand if you dislike hardware audio "processing" because they do things you shouldn't do to audio streams, but that's not really what true audio and audio processing in games is about..
The thing is that for one audio stream source, the best is to do *nothing* to it, but in a game, where you have up to several hundred audio streams that are triggered simultaneously, getting this represented correctly with respect to the player position and direction in a surround perspective, you kind of have to. And this probably requires quite a bit more processing time.
So I think there are definitely benefits to doing this in hardware.