How To Multiply A Single Value With Vera FX

All aspects of programming on the Commander X16.
Post Reply
Manannan
Posts: 78
Joined: Fri Oct 14, 2022 7:23 am

How To Multiply A Single Value With Vera FX

Post by Manannan »

Hi All,

I have been having trouble using the multiplier in VERA FX.

I have tried to follow the examples in the manual, tutorial and forum but I haven't had much luck.

My understanding was that you could load some values into the cache and multiply them and they should appear in the cache?
Is that wrong?

My goal is to speed up my code by replacing a call to a multiply function written in C with a function that uses VERA to achieve the same thing.

All I want to do is be able to write a function which is able to multiply two 16 bit numbers.

Here is a minimum reproducible example:

Code: Select all

; Minimal reproduction: attempts to multiply 2 x 2 with the VERA FX multiplier.
; NOTE(review): VERA_fx_cache_l and VERA_fx_cache_m are stored to further down
; but are not defined anywhere in this snippet, so it cannot assemble as posted.
; Several symbols below share one address; presumably the DCSEL value written
; to VERA_ctrl selects which register responds - confirm against the VERA docs.
VERA_fx_cache_h = $9F2B
VERA_fx_cache_u = $9F2C 
VERA_fx_mult = $9F2C
VERA_ctrl = $9F25
VERA_fx_ctrl = $9F29
VERA_fx_accum_reset = $9F29
; NOTE(review): identical address to VERA_fx_accum_reset above - verify that
; the accumulate trigger really lives at $9F29.
VERA_fx_accum = $9F29

; Load address and segment boilerplate; the instructions follow in CODE.
.org $080D
.segment "STARTUP"
.segment "INIT"
.segment "ONCE"
.segment "CODE"

; 1. Set DCSEL = 2 (the value is shifted left by one inside VERA_ctrl).
lda #(2 << 1)
sta VERA_ctrl

lda #%1000000 ; Enable cache write (literal has 7 binary digits; value is $40)
sta VERA_fx_ctrl

; 2. Turn on the multiplier.
lda #%00010000     ; MULT 16x16 mode
sta VERA_fx_mult

; 3. Set DCSEL = 6 to write inputs
lda #(6 << 1)
sta VERA_ctrl

lda VERA_fx_accum_reset ;Reset The Accumulator (the read itself is the trigger; A is discarded)

; Both operands are the constant 2: low byte = 2, high byte = 0.
lda #<2
sta VERA_fx_cache_l ;Load The Values Into The Cache
lda #>2
sta VERA_fx_cache_m
lda #<2
sta VERA_fx_cache_h
lda #>2
sta VERA_fx_cache_u

; NOTE(review): this reads $9F29, the same address as the accumulator reset
; above, and no instruction after it reads a product back from anywhere.
lda VERA_fx_accum ;Perform the multiplication

cosmicr
Posts: 53
Joined: Tue Nov 14, 2023 4:29 am

Re: How To Multiply A Single Value With Vera FX

Post by cosmicr »

Here's my implementation from my Another World port:

Code: Select all

; mulx_addr addra, addrb
;   16 x 16 multiply using the VERA FX multiplier.
;   In : addra, addrb = addresses of two little-endian 16-bit operands
;        (operand low byte at addr, high byte at addr+1).
;   Out: A = product bits 0-7, X = product bits 8-15.
;   Clobbers: Y, flags, VERA address 0 (left with increment 1), and the VRAM
;        scratch location at $1F9B0 that receives the product.
;   Preserves: VERA::CTRL (DCSEL/ADDRSEL), saved on the stack for the duration.
;
;   The FX cache registers are not saved/restored: FX_CACHE_* share their
;   addresses with the FX control registers and are only selected while
;   DCSEL = 6, and a read of FX_CACHE_L's address at DCSEL = 6 is
;   FX_ACCUM_RESET rather than the cache contents. Pushing those reads and
;   popping them back while DCSEL = 2 would overwrite FX_CTRL/FX_MULT and undo
;   the cleanup below.
.macro mulx_addr addra, addrb
.scope
    ; Remember the caller's DCSEL/ADDRSEL so it can be put back at the end
    lda VERA::CTRL
    pha

    ; DCSEL = 2 for FX control registers
    lda #(2 << 1)
    sta VERA::CTRL

    ; Clear FX control and enable multiplier
    stz FX_CTRL
    lda #%00010000      ; Enable multiplier
    sta FX_MULT

    ; DCSEL = 6 for cache registers
    lda #(6 << 1)
    sta VERA::CTRL

    ; Reset accumulator (the read is the trigger; the value in A is discarded)
    lda FX_CACHE_L      ; read side of this address = FX_ACCUM_RESET

    ; Load numbers into cache
    lda addra
    sta FX_CACHE_L      ; First number low byte
    lda addra+1
    sta FX_CACHE_M      ; First number high byte (0 for 8-bit)
    lda addrb
    sta FX_CACHE_H      ; Second number low byte
    lda addrb+1
    sta FX_CACHE_U      ; Second number high byte (0 for 8-bit)

    ; Back to DCSEL = 2 for writing result
    lda #(2 << 1)
    sta VERA::CTRL

    ; Enable cache write
    lda #%01000000      ; Cache Write Enable
    sta FX_CTRL

    ; Set up VRAM address to $1F9B0
    lda #$B0            ; Low byte
    sta VERA::ADDR
    lda #$F9            ; Middle byte
    sta VERA::ADDR+1
    lda #$01            ; VRAM address bit 16 set, no increment
    sta VERA::ADDR+2

    ; Trigger multiply and write the product to VRAM
    stz VERA::DATA0

    ; Set increment to read result
    lda #%00010001      ; Increment 1, VRAM address bit 16 still set
    sta VERA::ADDR+2

    ; Read result into A (low) and X (high)
    lda VERA::DATA0     ; Low byte
    ldx VERA::DATA0     ; High byte

    ; Cleanup: multiplier and cache writes off (DCSEL is still 2 here)
    stz FX_MULT
    stz FX_CTRL

    ; Restore the caller's DCSEL/ADDRSEL; Y is used so A/X keep the result
    ply
    sty VERA::CTRL
.endscope
.endmacro
You'd have to make a few adjustments to get the full 32-bit result, but the framework is there. You could also remove the stack save/restore for speed if you don't need it.
Manannan
Posts: 78
Joined: Fri Oct 14, 2022 7:23 am

Re: How To Multiply A Single Value With Vera FX

Post by Manannan »

cosmicr wrote: Mon Apr 28, 2025 4:39 am Here's my implementation from my Another World port:

Code: Select all

; mulx_addr addra, addrb
;   16 x 16 multiply using the VERA FX multiplier.
;   In : addra, addrb = addresses of two little-endian 16-bit operands
;        (operand low byte at addr, high byte at addr+1).
;   Out: A = product bits 0-7, X = product bits 8-15.
;   Clobbers: Y, flags, VERA address 0 (left with increment 1), and the VRAM
;        scratch location at $1F9B0 that receives the product.
;   Preserves: VERA::CTRL (DCSEL/ADDRSEL), saved on the stack for the duration.
;
;   The FX cache registers are not saved/restored: FX_CACHE_* share their
;   addresses with the FX control registers and are only selected while
;   DCSEL = 6, and a read of FX_CACHE_L's address at DCSEL = 6 is
;   FX_ACCUM_RESET rather than the cache contents. Pushing those reads and
;   popping them back while DCSEL = 2 would overwrite FX_CTRL/FX_MULT and undo
;   the cleanup below.
.macro mulx_addr addra, addrb
.scope
    ; Remember the caller's DCSEL/ADDRSEL so it can be put back at the end
    lda VERA::CTRL
    pha

    ; DCSEL = 2 for FX control registers
    lda #(2 << 1)
    sta VERA::CTRL

    ; Clear FX control and enable multiplier
    stz FX_CTRL
    lda #%00010000      ; Enable multiplier
    sta FX_MULT

    ; DCSEL = 6 for cache registers
    lda #(6 << 1)
    sta VERA::CTRL

    ; Reset accumulator (the read is the trigger; the value in A is discarded)
    lda FX_CACHE_L      ; read side of this address = FX_ACCUM_RESET

    ; Load numbers into cache
    lda addra
    sta FX_CACHE_L      ; First number low byte
    lda addra+1
    sta FX_CACHE_M      ; First number high byte (0 for 8-bit)
    lda addrb
    sta FX_CACHE_H      ; Second number low byte
    lda addrb+1
    sta FX_CACHE_U      ; Second number high byte (0 for 8-bit)

    ; Back to DCSEL = 2 for writing result
    lda #(2 << 1)
    sta VERA::CTRL

    ; Enable cache write
    lda #%01000000      ; Cache Write Enable
    sta FX_CTRL

    ; Set up VRAM address to $1F9B0
    lda #$B0            ; Low byte
    sta VERA::ADDR
    lda #$F9            ; Middle byte
    sta VERA::ADDR+1
    lda #$01            ; VRAM address bit 16 set, no increment
    sta VERA::ADDR+2

    ; Trigger multiply and write the product to VRAM
    stz VERA::DATA0

    ; Set increment to read result
    lda #%00010001      ; Increment 1, VRAM address bit 16 still set
    sta VERA::ADDR+2

    ; Read result into A (low) and X (high)
    lda VERA::DATA0     ; Low byte
    ldx VERA::DATA0     ; High byte

    ; Cleanup: multiplier and cache writes off (DCSEL is still 2 here)
    stz FX_MULT
    stz FX_CTRL

    ; Restore the caller's DCSEL/ADDRSEL; Y is used so A/X keep the result
    ply
    sty VERA::CTRL
.endscope
.endmacro
You'd have to make a few adjustments to get the full 32-bit result, but the framework is there. You could also remove the stack save/restore for speed if you don't need it.
Thanks for your help. That worked.

Here is my full working example; the only change I made other than using my own variables was to zero out the VERA_ctrl variable afterwards to avoid breaking drawing code.

I am augmenting this with my existing times-table lookup multiplier, which is faster but only works for multiplicands and multipliers less than or equal to 15.

Here are the rough performance stats:
C Multiplier (Including Trampoline): 400 Cycles
Vera Multiplier: 115 Cycles
Times-table Lookup Multiplier: 50 Cycles

Code: Select all

; VERA register addresses used by the FX multiplier example.
; $9F29-$9F2C are shared: the DCSEL field written to VERA_ctrl selects which
; register set answers at those addresses, so several names below map to the
; same address on purpose.

VERA_ctrl = $9F25

; DCSEL = 2: FX control registers
VERA_fx_ctrl = $9F29
VERA_fx_mult = $9F2C

; DCSEL = 6, write side: the four bytes of the 32-bit FX cache
VERA_fx_cache_l = $9F29
VERA_fx_cache_m = $9F2A
VERA_fx_cache_h = $9F2B
VERA_fx_cache_u = $9F2C

; DCSEL = 6, read side: a read triggers the named accumulator operation
VERA_fx_accum_reset = $9F29
VERA_fx_accum = $9F2A       ; was $9F29, which is the reset trigger, not accumulate

; VRAM address and data ports
VERA_addr_low     = $9F20
VERA_addr_high    = $9F21
VERA_addr_bank    = $9F22
VERA_data0        = $9F23
VERA_data1        = $9F24

; Load address and segment boilerplate; the instructions follow in CODE.
.org $080D
.segment "STARTUP"
.segment "INIT"
.segment "ONCE"
.segment "CODE"

; mulx_addr addra, addrb
;   Multiplies two 16-bit values with the VERA FX multiplier.
;   In : addra, addrb = addresses of the operands, low byte first.
;   Out: A = product bits 0-7, X = product bits 8-15.
;   On exit VERA_ctrl, VERA_fx_ctrl and VERA_fx_mult are all zero, and VERA
;   address 0 has been repointed at the scratch location with increment 1.
.macro mulx_addr addra, addrb
.scope
    ; Scope-local names for the magic numbers used below
    DCSEL_FX_CTRL  = 2 << 1         ; VERA_ctrl value selecting the FX control registers
    DCSEL_FX_CACHE = 6 << 1         ; VERA_ctrl value selecting the FX cache registers
    MULT_ENABLE    = %00010000      ; VERA_fx_mult: multiplier on
    CACHE_WRITE    = %01000000      ; VERA_fx_ctrl: cache write enable
    SCRATCH        = $1F9B0         ; VRAM location that receives the product
    INCREMENT_1    = %00010000      ; VERA_addr_bank: step address by 1 per access

    ; --- FX setup: everything off except the multiplier ---------------------
    lda #DCSEL_FX_CTRL
    sta VERA_ctrl
    stz VERA_fx_ctrl
    lda #MULT_ENABLE
    sta VERA_fx_mult

    ; --- Operands into the cache --------------------------------------------
    lda #DCSEL_FX_CACHE
    sta VERA_ctrl
    lda VERA_fx_accum_reset         ; the read clears the accumulator; A is discarded

    lda addra
    sta VERA_fx_cache_l             ; operand A, bits 0-7
    lda addra+1
    sta VERA_fx_cache_m             ; operand A, bits 8-15
    lda addrb
    sta VERA_fx_cache_h             ; operand B, bits 0-7
    lda addrb+1
    sta VERA_fx_cache_u             ; operand B, bits 8-15

    ; --- Multiply: a cache write to VRAM stores the product -----------------
    lda #DCSEL_FX_CTRL
    sta VERA_ctrl
    lda #CACHE_WRITE
    sta VERA_fx_ctrl

    lda #<SCRATCH                   ; $B0
    sta VERA_addr_low
    lda #>SCRATCH                   ; $F9
    sta VERA_addr_high
    lda #^SCRATCH                   ; $01: address bit 16, increment 0
    sta VERA_addr_bank

    stz VERA_data0                  ; this write triggers the multiply

    ; --- Fetch the low 16 bits of the product -------------------------------
    lda #(INCREMENT_1 | ^SCRATCH)   ; $11: same address, now stepping by 1
    sta VERA_addr_bank

    lda VERA_data0                  ; A = product low byte
    ldx VERA_data0                  ; X = product high byte

    ; --- Leave VERA in its plain state so later drawing code is unaffected --
    stz VERA_fx_mult
    stz VERA_fx_ctrl
    stz VERA_ctrl

.endscope
.endmacro

; Test driver for mulx_addr. Operands live in zero page: $22/$23 hold the
; first 16-bit value, $24/$25 the second (low byte first).

; Test 1: 2 * 4 -> expect A = $08, X = $00
lda #$2
sta $22
stz $23
lda #$4
sta $24
stz $25

mulx_addr $22, $24
stp                     ; stop here so A/X can be inspected

; Test 2: 2 * 8 -> expect A = $10, X = $00
lda #$2
sta $22
stz $23
lda #$8
sta $24
stz $25

mulx_addr $22, $24
stp                     ; stop again to inspect the second result
rts                     ; return to the caller instead of running off the end
Post Reply