Create assembly routines for performance‐critical sections - retrotruestory/M1DEV GitHub Wiki
This page shows how to write optimized assembly routines for performance-critical sections of Magic-1 code. The following practical example implements fast memory operations:
// Segment selection and exported symbols for the routines in this listing.
// Presumably .cseg places what follows in the code segment -- confirm
// against the assembler manual.
.cseg
// Entry points made visible to the linker. The C side declares these as
// fast_memcpy / fast_memset, i.e. without the leading underscore.
.global _fast_memcpy
.global _fast_memset
// NOTE(review): _fast_sector_read is exported here, but no definition is
// visible in this listing -- confirm it is defined elsewhere.
.global _fast_sector_read
// _fast_memcpy -- block copy carried out by the memcopy instruction.
// C prototype: void fast_memcpy(void* dest, void* src, unsigned int count)
//
// Registers a, b and c are pushed on entry and popped again before the
// return, so the caller sees them unchanged.
//
// NOTE(review): the 12/14/16(sp) argument offsets depend on what `enter 8`
// and the three pushes place between sp and the arguments -- confirm
// against the Magic-1 calling convention.
// NOTE(review): dest is loaded into b and src into a; confirm this matches
// the source/destination registers memcopy expects.
_fast_memcpy:
        enter   8               ; open the stack frame
        push    a               ; preserve the three working registers
        push    b
        push    c
        ld.16   c,16(sp)        ; c = count
        ld.16   a,14(sp)        ; a = src
        ld.16   b,12(sp)        ; b = dest
        memcopy                 ; the whole copy is a single instruction
        pop     c               ; restore in reverse push order
        pop     b
        pop     a
        leave
        ret
// _fast_memset -- fill a block of memory with a single byte value.
// C prototype: void fast_memset(void* dest, char value, unsigned int count)
//
// Registers a, b and c are pushed on entry and popped again before the
// return, so the caller sees them unchanged.
//
// The count is tested before every store, so count == 0 writes nothing.
// A store-then-test loop would write one byte, wrap c from 0 to 0xFFFF and
// then keep going for 65536 bytes in total.
//
// NOTE(review): the 12/14/16(sp) argument offsets (and the 8-bit load of
// `value`) depend on how `enter 8`, the three pushes and argument promotion
// lay out the stack -- confirm against the Magic-1 calling convention.
// NOTE(review): `add.16 c,#1` and the unconditional `br` are assumed to be
// accepted in the same way as the `sub.16 c,#1` / `br.ne` forms used below
// -- confirm against the instruction set reference.
_fast_memset:
        enter   8
        push    a
        push    b
        push    c
        ld.16   b,12(sp)        ; b = dest (write pointer)
        ld.8    a,14(sp)        ; a = fill byte
        ld.16   c,16(sp)        ; c = count
        add.16  c,#1            ; bias by one: .check decrements before testing
                                ; (16-bit wrap keeps count = 0xFFFF correct)
        br      .check          ; enter at the test so zero stores are possible
.loop:
        st.8    (b),a           ; *dest = value
        add.16  b,#1            ; dest++
.check:
        sub.16  c,#1            ; one fewer byte outstanding
        br.ne   .loop           ; non-zero => another byte to store
        pop     c               ; restore in reverse push order
        pop     b
        pop     a
        leave
        ret
Then modify the C code to use these routines:
// Declare the assembly routines
// Both are implemented in the assembly listing, where the labels carry a
// leading underscore (_fast_memcpy, _fast_memset).
// fast_memcpy: copy `count` bytes from `src` to `dest`.
extern void fast_memcpy(void* dest, void* src, unsigned int count);
// fast_memset: store `value` into `count` bytes starting at `dest`.
extern void fast_memset(void* dest, char value, unsigned int count);
// Wrappers that swap the existing memory operations for the optimized
// assembly versions when USE_OPTIMIZED is defined.

// Copy `size` bytes of a boot image from `src` to `dest`.
// A USE_OPTIMIZED build routes the copy through fast_memcpy; every other
// build keeps the library memcpy.
void copy_boot_image(char* dest, char* src, int size) {
#ifndef USE_OPTIMIZED
    // ...existing code...
    memcpy(dest, src, size);
#else
    fast_memcpy(dest, src, size);
#endif
}
// Zero the first `size` bytes of `buffer`.
// A USE_OPTIMIZED build uses fast_memset; every other build keeps the
// library memset.
//
// A size of zero returns immediately: guard here rather than relying on
// the fill routine to treat a zero count as a no-op. Negative sizes are
// deliberately not rejected, because they convert to the same unsigned
// count as before and existing callers may depend on that.
void clear_buffer(char* buffer, int size) {
    if (size == 0) {
        return;  // nothing to clear
    }
#ifdef USE_OPTIMIZED
    fast_memset(buffer, 0, size);
#else
    // ...existing code...
    memset(buffer, 0, size);
#endif
}
Update the Makefile to include assembly files:
# Hand-written assembly sources and the object files assembled from them.
ASOURCES = fast_mem.s
AOBJECTS = $(ASOURCES:.s=.o)

# Link the C objects together with the assembly objects, keeping bcrt0.o
# first and "-lc crtn.o" last on the link line.
# Recipe lines must begin with a tab character.
$(TARGET): $(OBJECTS) $(AOBJECTS)
	$(LD) $(LDFLAGS) -o $@ bcrt0.o $(OBJECTS) $(AOBJECTS) -lc crtn.o

# Assemble each .s source into the matching .o object.
%.o: %.s
	$(AS) -o $@ $<
Key advantages of this approach:
- Uses native Magic-1 instructions like `memcopy`
- Reduces function call overhead
- Optimizes register usage
- Takes advantage of hardware acceleration where available
The assembly routines will significantly improve performance for these critical operations while keeping the main code maintainable in C.