Create assembly routines for performance‐critical sections - retrotruestory/M1DEV GitHub Wiki

This page shows how to write optimized assembly routines for performance-critical sections on Magic-1. The practical example below implements fast memory copy and fill operations:

        ; Presumably selects the code segment for everything that follows -- confirm
        ; against the assembler manual.
        .cseg
        ; Entry points exported to the linker.
        .global _fast_memcpy
        .global _fast_memset
        ; NOTE(review): no _fast_sector_read label appears in this listing -- confirm it
        ; is defined elsewhere, or drop this export.
        .global _fast_sector_read

// Fast memory copy routine
// void fast_memcpy(void* dest, void* src, unsigned int count)
//
// Loads dest into b, src into a and count into c, then issues a single
// memcopy instruction. a, b and c are pushed on entry and popped in reverse
// order before returning, so the caller gets them back unchanged.
//
// NOTE(review): confirm against the Magic-1 instruction set reference which
// of a/b memcopy treats as source and which as destination, and what it does
// when c is 0.
// NOTE(review): an 8-byte frame is allocated and three registers are pushed
// before the argument loads; confirm that 12/14/16(sp) really address the
// three arguments under the calling convention in use.
_fast_memcpy:
        enter   8           ; Setup stack frame
        push    a           ; Save registers we'll use
        push    b
        push    c
        
        ld.16   b,12(sp)   ; Load dest address
        ld.16   a,14(sp)   ; Load src address
        ld.16   c,16(sp)   ; Load count
        
        memcopy            ; Use hardware assisted copy instruction
        
        pop     c          ; Restore registers
        pop     b
        pop     a
        leave
        ret

// Fast memory set routine 
// void fast_memset(void* dest, char value, unsigned int count)
//
// Stores the byte loaded into a to count consecutive bytes starting at dest.
// A count of 0 stores nothing: the count is tested before the first store,
// so the loop never runs with a zero counter (which would otherwise store
// once, wrap the counter and keep filling).
// a, b and c are pushed on entry and popped in reverse order before
// returning, so the caller gets them back unchanged.
//
// NOTE(review): an 8-byte frame is allocated and three registers are pushed
// before the argument loads; confirm that 12/14/16(sp) really address the
// three arguments under the calling convention in use.
// NOTE(review): value is read with an 8-bit load at 14(sp); if char
// arguments are widened to 16 bits on the stack, confirm this offset picks
// the byte that holds the value.
_fast_memset:
        enter   8
        push    a
        push    b
        push    c
        
        ld.16   b,12(sp)   ; b = dest (write pointer)
        ld.8    a,14(sp)   ; a = fill value
        ld.16   c,16(sp)   ; c = bytes remaining

        ; Guard: subtracting 0 leaves c unchanged but sets the flags from it,
        ; the same way the loop's own sub.16/br.ne pair relies on.
        ; NOTE(review): br.eq is assumed to be the counterpart of br.ne.
        sub.16  c,#0
        br.eq   .fill_done ; count == 0: nothing to store

.fill_loop:
        st.8    (b),a      ; Store value
        add.16  b,#1       ; Advance write pointer
        sub.16  c,#1       ; One fewer byte remaining
        br.ne   .fill_loop ; Continue while bytes remain

.fill_done:
        pop     c          ; Restore registers
        pop     b
        pop     a
        leave
        ret

Then modify the C code to use these routines:

// Declare the assembly routines
// The assembly file exports them as _fast_memcpy and _fast_memset
// (presumably the C toolchain adds the leading underscore -- confirm).
// fast_memcpy: copy count bytes from src to dest.
extern void fast_memcpy(void* dest, void* src, unsigned int count);
// fast_memset: store value into count bytes starting at dest.
extern void fast_memset(void* dest, char value, unsigned int count);

// Replace existing memory operations with optimized versions
// Copy size bytes of a boot image from src to dest.
// Uses the assembly fast_memcpy when USE_OPTIMIZED is defined and the
// library memcpy otherwise.
// A size of zero or less copies nothing: size is signed, but both copy
// routines take an unsigned count, so a negative value would otherwise be
// converted into a very large length.
void copy_boot_image(char* dest, char* src, int size) {
    if (size <= 0) {
        return;
    }
    #ifdef USE_OPTIMIZED
        fast_memcpy(dest, src, (unsigned int)size);
    #else
        // ...existing code...
        memcpy(dest, src, size);
    #endif
}

// Zero the first size bytes of buffer.
// Uses the assembly fast_memset when USE_OPTIMIZED is defined and the
// library memset otherwise.
// A size of zero or less clears nothing: size is signed, but both fill
// routines take an unsigned count, so a zero or negative value is never
// passed through as a length.
void clear_buffer(char* buffer, int size) {
    if (size <= 0) {
        return;
    }
    #ifdef USE_OPTIMIZED
        fast_memset(buffer, 0, (unsigned int)size);
    #else
        // ...existing code...
        memset(buffer, 0, size);
    #endif
}

Update the Makefile to include assembly files:

# Hand-written assembly sources and the objects built from them.
ASOURCES = fast_mem.s
AOBJECTS = $(ASOURCES:.s=.o)

# Link the C objects and the assembly objects into the target.
# Recipe lines must start with a tab character, not spaces.
$(TARGET): $(OBJECTS) $(AOBJECTS)
	$(LD) $(LDFLAGS) -o $@ bcrt0.o $(OBJECTS) $(AOBJECTS) -lc crtn.o

# Assemble each .s file into the matching .o file.
%.o: %.s
	$(AS) -o $@ $<

Key advantages of this approach:

  1. Uses native Magic-1 instructions like memcopy
  2. Reduces function call overhead
  3. Optimizes register usage
  4. Takes advantage of hardware acceleration where available

These assembly routines should speed up these critical operations while the main code stays maintainable in C; build with and without USE_OPTIMIZED and measure to confirm the gain.