Loop tiling - mratsim/laser GitHub Wiki
TODO
Nd tiling using Space-Filling Curve
- Generating Families of Practical Fast Matrix Multiplication Algorithms
- From FLAME/BLIS, Morton indexing (Z-order curve)
- Strassen Matrix Multiplication
- https://arxiv.org/pdf/1611.01120.pdf
- The effect of reordering multi-dimensional array data on CPU cache utilisation
# ----------------------------------------------------------------------
# zorder64_inv -- de-interleave a 64-bit Z-order (Morton) code.
#
# Syntax: GAS / AT&T, x86-64. Requires BMI2 (PEXT).
# In:     rcx = 64-bit interleaved code
#         (first-argument register of the Microsoft x64 convention --
#          NOTE(review): ABI inferred from register usage, confirm)
# Out:    rax = (odd-position bits, compacted) << 32
#               | (even-position bits, compacted)
# Clobb:  rax, rdx, flags
# ----------------------------------------------------------------------
zorder64_inv:
        movabsq $0xAAAAAAAAAAAAAAAA, %rax       # selector for bits 1,3,...,63
        movabsq $0x5555555555555555, %rdx       # selector for bits 0,2,...,62
        pextq   %rax, %rcx, %rax                # rax = odd bits packed into [31:0]
        pextq   %rdx, %rcx, %rdx                # rdx = even bits packed into [31:0]
        shlq    $32, %rax                       # odd half moves to the upper dword
        orq     %rdx, %rax                      # rdx[63:32] is already 0 (32-bit mask)
        retq
# ----------------------------------------------------------------------
# zorder64 -- interleave two 32-bit values into a 64-bit Z-order
#             (Morton) code.
#
# Syntax: GAS / AT&T, x86-64. Requires BMI2 (PDEP).
# In:     ecx = value placed on even bit positions (0,2,...,62)
#         edx = value placed on odd  bit positions (1,3,...,63)
#         (first two argument registers of the Microsoft x64 convention
#          -- NOTE(review): ABI inferred from register usage, confirm)
# Out:    rax = interleaved 64-bit code
# Clobb:  rax, rcx, r8, flags
# ----------------------------------------------------------------------
zorder64:
        movabsq $0x5555555555555555, %r8        # even-position deposit mask
        movabsq $0xAAAAAAAAAAAAAAAA, %rax       # odd-position deposit mask
        # Each mask has 32 set bits, so PDEP consumes only the low 32 bits
        # of its source; anything in the upper halves of rcx/rdx is ignored.
        pdepq   %r8, %rcx, %rcx                 # rcx = ecx spread over even bits
        pdepq   %rax, %rdx, %rax                # rax = edx spread over odd bits
        orq     %rcx, %rax                      # merge the two bit lanes
        retq