Optimizing the Sound Mixer

Let's have another look at our basic sound mixer.

/***********************************************************
 * MOD_MixChunk( target, voices )
 *
 * Mix a chunk of audio data and write it to 'target'.
 *
 * r0 = target: receives 64 mixed samples, one byte each
 * r1 = voices: first of 4 voice structs, VOICE_SIZE bytes apart
 *
 * Each voice is read as three words:
 *   word 0: source pointer (0 = voice disabled)
 *   word 1: bits 24-31 = read position fraction,
 *           bits 0-23  = samples remaining
 *   word 2: bits 16-31 = loop length, bits 10-15 = volume,
 *           bits 0-9   = rate
 * Read position and rate are x.8 fixed point.
 *
 * MOD_MixBuffer, VOICE_REMAIN and VOICE_SIZE are defined
 * outside this routine.
 ***********************************************************/
MOD_MixChunk:

	push	{r4-r11, lr}			@ save callee-saved registers + return address

	push	{r0}				@ keep target for the final mixdown

	ldr	r2,=MOD_MixBuffer		@ clear mixing buffer
	mov	r3, #0				@ four zeroed registers ...
	mov	r4, #0				@
	mov	r5, #0				@
	mov	r6, #0				@
	stmia	r2!, {r3-r6}			@ ... stored 8 times = 32 words
	stmia	r2!, {r3-r6}			@     = 64 halfword mix entries
	stmia	r2!, {r3-r6}			@
	stmia	r2!, {r3-r6}			@
	stmia	r2!, {r3-r6}			@
	stmia	r2!, {r3-r6}			@
	stmia	r2!, {r3-r6}			@
	stmia	r2!, {r3-r6}			@

	mov	r4, r1				@ r4 = voices
	mov	r5, #4				@ r5 = iteration counter

mixloop:
	ldmia	r4, {r6,r7,r8}			@ r6 = source
	cmp	r6, #0				@ if source == 0 then do next voice
	beq	mix_next_voice
	mov	r9, r7, lsr#24			@ r9 = read
	bic	r7, #0xFF000000			@ r7 = samples remaining
	lsr	r10, r8, #10		
	and	r10, #63			@ r10 = volume
	lsr	r11, r8, #16			@ r11 = loop
	mov	r8, r8, lsl#32-10
	mov	r8, r8, lsr#32-10		@ r8 = rate
	
	ldr	r12,=MOD_MixBuffer		@ r12 = work buffer
	mov	r14, #64			@ r14 = samples left to mix in this chunk

	push	{r4-r5}				@ r4 is reused as scratch inside the loop

@-----------------------------------------------------------------------------------------

mixingloop:	
	ldrh	r0, [r12]			@ read mix buffer
	mov	r4, r9, lsr#8			@ get integer of read position
	ldrsb	r4, [r6, r4]			@ read signed sample
	mla	r0, r4, r10, r0			@ multiply by volume and add to mix buffer
	strh	r0, [r12], #2			@ store mix buffer entry
	
	add	r9, r9, r8			@ read += rate
	cmp	r9, r7, lsl#8			@ read position still inside the sample?
	blt	1f				@   yes: skip the end-of-sample handling
	cmp	r11, #0				@ no loop set?
	beq	end_of_sample			@   then the voice ends here
	add	r7, r7, r6			@ remaining += source
	sub	r9, r9, r11, lsl#8		@ read -= loop
	add	r6, r6, r9, asr#8		@ source += read
	and	r9, #255			@ read &= 255
	sub	r7, r7, r6			@ remaining -= source
1:
	
	subs	r14, #1				@ decrement counter
	bne	mixingloop			@ loop

	pop	{r4-r5}				@ restore channels,iteration counter
	add	r6, r9, lsr#8			@ add read position to source
	sub	r7, r9, lsr#8			@ subtract from remaining count
	orr	r7, r7, r9, lsl#24		@ combine remaining | frac
	str	r7, [r4, #VOICE_REMAIN]		@ store
	
	b	mix_next_voice

end_of_sample:
	pop	{r4-r5}				@ restore channels,iteration counter
	mov	r6, #0				@ CLEAR SOURCE

mix_next_voice:
	str	r6, [r4]			@ save source

	
	add	r4, #VOICE_SIZE			@ get next voice
	subs	r5, #1				@ decrement iterator
	bne	mixloop				@ loop

	pop	{r0}				@ pop target
	mov	r1, #64/2			@ loop 32 times
	ldr	r2,=MOD_MixBuffer
	
1:	ldr	r3, [r2], #4			@ read 2 mixbuffer entries
	lsr	r3, #8				@ pack into 1 word
	bic	r3, #0xFF00			@ keep only the high byte of each entry
	orr	r3, r3, r3, lsr#8		@ move the second byte next to the first
	strh	r3, [r0], #2			@ store word to target
	subs	r1, #1				@ decrement and loop
	bne	1b
	
	pop	{r4-r11, lr}			@ return
	bx	lr

I can tell you that 90% of our execution time is within the 'mixingloop' section!

We should test for the end of the sample before entering the loop, so that the end-of-sample check can be moved out of the loop body. We can also squeeze some more performance out of the multiply and addition by operating on 2 samples at the same time.

Give me a few minutes (hours) to rewrite this mess.....

/***********************************************************
 * MOD_MixChunk( target, voices )
 *
 * Mix a chunk of audio data and write it to 'target'.
 *
 * r0 = target: receives BUFFER_SIZE/2 mixed samples, one byte
 *              each, written with word stores (stmia)
 * r1 = voices: first of 4 voice structs, VOICE_SIZE bytes apart
 *
 * Each voice is read as three words:
 *   word 0: source pointer (0 = voice disabled)
 *   word 1: bits 24-31 = read position fraction,
 *           bits 0-23  = samples remaining
 *   word 2: bits 16-31 = loop length (doubled after loading),
 *           bits 10-15 = volume, bits 0-9 = rate
 * Read position and rate are x.8 fixed point.
 *
 * Uses BIOS calls 0Ch (CpuFastSet) and 06h (Div) and DMA
 * channel 3. MOD_MixBuffer, MOD_SampleData, _zero_word,
 * DMA3SAD, BUFFER_SIZE and the VOICE_* offsets are defined
 * outside this routine.
 ***********************************************************/
MOD_MixChunk:
	
	push	{r4-r11, lr}			@ save callee-saved registers + return address
	
	push	{r0}				@ keep target for the final mixdown
	
	mov	r4, r1				@ r4 = voices
	mov	r5, #4				@ r5 = iteration counter
	
	adr	r0, _zero_word			@ clear mixing buffer
	ldr	r1,=MOD_MixBuffer		@
	ldr	r2,=(1<<24)|(BUFFER_SIZE/4)	@ bit 24 = fill from fixed source, low bits = word count
	swi	0x0C0000			@ SWI 0Ch - CpuFastSet	

mixloop:
	ldmia	r4, {r6,r7,r8}			@ parse data from voice struct (r6 = source)
	cmp	r6, #0				@ if source == 0 then this voice is disabled
	beq	mix_next_voice			@
	mov	r9, r7, lsr#24			@ r9 = read position (fraction)
	bic	r7, #0xFF000000			@ r7 = samples remaining
	lsr	r10, r8, #10			@
	and	r10, #63			@ r10 = volume
	lsr	r11, r8, #16			@ r11 = loop
	lsl	r11, #1				@ r11 = loop * 2 (NOTE(review): presumably stored in 2-byte units - confirm)
	mov	r8, r8, lsl#32-10		@
	mov	r8, r8, lsr#32-10		@ r8 = rate
	ldr	r12,=MOD_MixBuffer		@ r12 = work buffer
	mov	r14, #BUFFER_SIZE/2		@ r14 = sample counter

@-------------------------------------------------------------------------------
@ work out how many samples can be mixed before the sample data runs out
@-------------------------------------------------------------------------------

mixloop2:
	push	{r4-r5}				@ save channels + counter (r4/r5 are scratch while mixing)
	mul	r2, r8, r14			@ <-samples required (x.8 fixed point)
	rsb	r0, r9, r7, lsl#8		@ <-samples remaining until end (x.8 fixed point)
	cmp	r2, r0				@ clip count if samples exceed length
	blt	1f				@
	mov	r2, r0				@ r2 = amount of source data that will be consumed
	mov	r1, r8				@ new count = remaining / rate
	swi	0x060000			@ SWI 06h - Div: r0 = quotient, r1 = remainder

	cmp	r1, #0				@ round up result
	addne	r0, #1				@
	b	2f				@
1:	mov	r0, r14				@ no clipping: mix everything that is left
2:	sub	r14, r0				@ subtract count from total count
	push	{r14}				@ save what is left for the next pass
	mov	r14, r0				@ r14 = samples to mix in this pass
	
@-------------------------------------------------------------------------------
@ copy sample data into RAM
@-------------------------------------------------------------------------------
	
	add	r2, #2048			@ add 8 bytes (2 words) for safety
	lsr	r2, #8+2			@ translate result into WORD count
	ldr	r0,=DMA3SAD			@ set source address to aligned source
	bic	r1, r6, #0b11			@
	str	r1, [r0, #0]			@
	add	r1, r2,#(((1<<15)+(1<<10))<<16)	@ dmacnt= enable|immediate|32bit|inc/inc
	str	r1, [r0, #8]			@ NOTE(review): DMA3's destination is not written here;
						@ presumably set to MOD_SampleData elsewhere - confirm
	
	@ [--DMA Process--]
	
	and	r0, r6, #0b11			@ r6 = copied source
	ldr	r6,=MOD_SampleData		@
	add	r6, r0				@ (keeps the source's offset within its word)
	
@----------------------------------------------------------------------------------
@ mix the data into the mixing buffer
@----------------------------------------------------------------------------------

@ mixd: mix 2 samples into one word (2 mix entries) held in \dest.
@ Leading numbers in the comments are cycle counts.
@ In/out: r9 = read position (advanced twice by r8)
@ In:     r6 = sample data, r10 = volume
@ Scratch: r4, r5 (so \dest must not be r4 or r5)
.macro mixd	dest
	mov	r5, r9, lsr#8			@ 1 read 2 samples and combine them
	ldrsb	r4, [r6, r5]			@ 3
	add	r9, r8				@ 1
	mov	r5, r9, lsr#8			@ 1
	ldrsb	r5, [r6, r5]			@ 3
	add	r9, r8				@ 1
	add	r4, r5, lsl#16			@ 1 second sample goes into the upper halfword
	mla	\dest, r4, r10, \dest		@ 3 scale by volume and add to mix
.endm

@ dmix1: mix 1 sample into the halfword at [r12] and advance r12.
@ \dest is the working register for the whole read-modify-write.
@ In/out: r9 = read position (advanced by r8), r12 = mix buffer pointer
@ In:     r6 = sample data, r10 = volume
@ Scratch: r4 (so \dest must not be r4). Does not change the flags.
.macro dmix1	dest
	ldrh	\dest, [r12]			@ read mix buffer
	mov	r4, r9, lsr#8			@ read sample
	ldrsb	r4, [r6, r4]			@
	add	r9, r8				@
	mla	\dest, r4, r10, \dest		@ mul + add
	strh	\dest, [r12], #2		@ write mix buffer
.endm

	cmp	r14, #0				@ align destination
	beq	mixend				@
	tst	r12, #0b11			@
	beq	mix_aligned			@
	dmix1	r0				@ (mix 1 sample if misaligned)
	subs	r14, #1
mix_aligned:					@
	
	subs	r14, #8				@ mix 8 sample chunks
	bmi	mix8end				@
mix8:						@
	ldmia	r12, {r0-r3}			@ 6
	mixd	r0				@ 14
	mixd	r1				@ 14
	mixd	r2				@ 14
	mixd	r3				@ 14
	stmia	r12!, {r0-r3}			@ 5
	subs	r14, #8				@ 1
	bpl	mix8				@ 3
mix8end:
	adds	r14, #8				@ mix remaining amount (up to 7 iterations)
	beq	mixend				@
mix1:						@ two samples per pass
	dmix1	r0				@
	subs	r14, #2				@
	bmi	mixend				@ odd count: that was the last sample
	dmix1	r0				@ (leaves the flags from the subs intact)
	bne	mix1				@

@-----------------------------------------------------------------------------------
mixend:
@-----------------------------------------------------------------------------------
	
	pop	{r14}				@ r14 = samples still to mix for this voice
	pop	{r4-r5}				@ restore channel + counter

	ldr	r6, [r4, #VOICE_SOURCE]		@ r6 = source
	add	r6, r9, lsr#8			@ source += pos
	subs	r7, r9, lsr#8			@ remain -= pos
	and	r9, #255			@ pos &= 255 (clear integer)

	bgt	1f				@ if remaining <= 0
	cmp	r11, #0				@   if not loop
	beq	end_of_sample			@     end voice
	add	r7, r11				@   remaining += loop
	sub	r6, r11				@   source -= loop
1:

	str	r6, [r4, #VOICE_SOURCE]		@ save value

	cmp	r14, #0				@ mix more samples?
	bne	mixloop2			@
	
	orr	r7, r7, r9, lsl#24		@ combine remaining | frac
	str	r7, [r4, #VOICE_REMAIN]		@ store

	b	mix_next_voice

end_of_sample:
	mov	r6, #0				@ source = 0 disables the voice
	str	r6, [r4, #VOICE_SOURCE]

mix_next_voice:
	
	add	r4, #VOICE_SIZE			@ get next voice
	subs	r5, #1				@ decrement iterator
	bne	mixloop				@ loop
	
@-----------------------------------------------------------------------------------
@ mixdown: keep the high byte of every 16-bit mix entry and write it to target
@-----------------------------------------------------------------------------------

	pop	{r0}				@ pop target
	
	mov	r1, #BUFFER_SIZE/2/8		@ loop X times (8 entries per pass)
	ldr	r2,=MOD_MixBuffer
	ldr	r11,=0xFF00FF00			@ mask selecting the high byte of both entries

@ mixdown: pack the high bytes of the 4 mix entries in \a and \b into \a.
@ \c is scratch; r11 must hold 0xFF00FF00.
.macro mixdown	a, b, c
	and	\a, r11				@ bb00aa00
	orr	\a, \a, \a, lsl#8		@ bbaaaa00
	mov	\a, \a, lsr#16			@ 0000bbaa
	and	\c, \b, #0xFF00			@ 0000cc00
	orr	\a, \c, lsl#8			@ 00ccbbaa
	and	\c, \b, #0xFF000000		@ dd000000
	orr	\a, \c				@ ddccbbaa
.endm
	
1:	ldmia	r2!, {r3-r6}			@ read 8 mixbuffer entries
	mixdown	r3, r4, r12
	mixdown	r5, r6, r12
	stmia	r0!, {r3,r5}			@ write 8 output samples
	subs	r1, #1
	bne	1b

	pop	{r4-r11, lr}			@ return
	bx	lr

There. :)

What I did first was optimize the mixing part a bit to precalculate how many samples to actually mix before we reach the end of the segment or the end of the sample. Then I optimized the main mixing loop a lot (notice the double-sample multiplying, and the 8-sample loop).

One other thing I did was DMA the sample data into RAM before giving it to the mixing loop. This saves us some cycles from the slow memory access time to ROM (although not so many cycles at higher resampling rates). After I was done, the load was at 12%, which is kind of lame.

I figured it's probably a bunch of overhead generated from our small buffers, so I doubled the buffer sizes! That dropped the load down to 8%.

Last thing I did was optimize the 'mixdown' loop (at the end), which pushed it to around ~7%... which is still kind of lame.

One problem, though, is with chiptunes. The small looping samples in chiptunes tend to trigger the 'sample end' check a lot, which slows them down to ~10% load :(. We could speed this up by writing a faster division method (but I'm much too lazy). Some other MOD players for GBA preprocess the data and unroll such tiny loops.

(also these measurements are with 4 channels only ...)

Advantages/Disadvantages of Unsigned Samples

One disadvantage is that if we use unsigned samples, then we won't be able to play 'raw' MODs anymore. :(

Have a closer look at the main mixing loop & macro:

@ mixd: mix 2 signed samples into the word (2 mix entries) held in \dest.
@ Leading numbers in the comments are cycle counts (14 in total).
@ r6 = sample data, r9 = read position (x.8), r8 = rate, r10 = volume,
@ r4/r5 = scratch.
.macro mixd	dest
	mov	r5, r9, lsr#8			@ 1 read 2 samples and combine them
	ldrsb	r4, [r6, r5]			@ 3
	add	r9, r8				@ 1
	mov	r5, r9, lsr#8			@ 1
	ldrsb	r5, [r6, r5]			@ 3
	add	r9, r8				@ 1
	add	r4, r5, lsl#16			@ 1 second sample goes into the upper halfword
	mla	\dest, r4, r10, \dest		@ 3 scale by volume and add to mix
.endm

@ Main loop: 8 samples per pass. Leading numbers in the comments are
@ cycle counts: 6 + 4*14 + 5 + 1 + 3 = 71.
@ r12 = mix buffer pointer, r14 = sample counter.
mix8:						@
	ldmia	r12, {r0-r3}			@ 6  load 8 mix entries (4 words)
	mixd	r0				@ 14
	mixd	r1				@ 14
	mixd	r2				@ 14
	mixd	r3				@ 14
	stmia	r12!, {r0-r3}			@ 5  store them back and advance
	subs	r14, #8				@ 1
	bpl	mix8				@ 3
mix8end:

If we add up the cycle counts, we get 71 cycles per 8 samples. That's 8.875 cycles per sample. If we are resampling at 16 kHz, then we add another ~0.75 cycles to our number for the DMA copy. So that's ~9.625 cycles per sample (more or less).

Now, if we used unsigned samples, then we wouldn't need sign extension during the mixing loop, so we could replace each sample read (the mov/ldrsb pairs in the macro above) with something 1 cycle faster:

@ mixd (unsigned samples): mix 2 samples into the word held in \dest.
@ ldrb takes the shifted read position directly as its offset, so the
@ separate mov of the signed version is gone: 12 cycles instead of 14.
@ r6 = sample data, r9 = read position (x.8), r8 = rate, r10 = volume,
@ r4/r5 = scratch.
.macro mixd	dest
	ldrb	r4, [r6, r9, lsr#8]		@ 3 read 2 samples and combine them
	add	r9, r8				@ 1
	ldrb	r5, [r6, r9, lsr#8]		@ 3
	add	r9, r8				@ 1
	add	r4, r5, lsl#16			@ 1 second sample goes into the upper halfword
	mla	\dest, r4, r10, \dest		@ 3 scale by volume and add to mix
.endm

If we do this, then we should use only 63 cycles per 8 samples, which is 7.875 + 0.75 = 8.625 cycles per sample. That's about 10% faster than the other routine!

But: The mixdown loop must deal with the unsigned samples. To operate on unsigned samples, what we have to do is add up the volume levels of each channel, and then subtract the "total/2" (the offset that the unsigned midpoint of 128 contributes, i.e. 128 times the summed volume) from each mixed sample to convert it to signed. Another problem here is when the sample ends; we need to mix '0' (128 unsigned) into the rest of the mixing buffer, or else we'll hear some garbage.

I don't think using unsigned samples will give us a very good advantage in this case, since we are only using 4-8 channels. One major advantage of using unsigned samples is that you can easily shift multiplied values during the mixing loop, this is critical for a mixer that handles more than 4-8 channels and keeps a good volume range.

Anyway, I just thought I would mention it, but I won't be implementing that since it would be too much trouble for this waste of time. :)

I'm getting pretty bored of this, so I think I'll finish up here. :P

Previous: Profiling and Optimization | Contents | Next: The Product