Skip to content

Commit 6bfd141

Browse files
committed
Generated assembly is a little better now: LLVM fixed two spurious movs
1 parent 3422eef commit 6bfd141

File tree

2 files changed

+48
-53
lines changed

2 files changed

+48
-53
lines changed

README.md

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -341,34 +341,31 @@ For example, a matrix multiplication can be tiled like so:
341341
}
342342
```
343343
344-
This generates the following machine code(\*) for the inner loop using clang 11 with -O2 -ffast-math:
344+
This generates the following machine code(\*) for the inner loop using clang 18 with -O2 -ffast-math:
345345
```assembly
346-
LBB8_7:
347-
vmovaps %ymm12, %ymm13
348-
vmovaps %ymm11, %ymm14
349-
vbroadcastss (%r13,%rcx,4), %ymm15
350-
vmovups -64(%r9,%rdi,4), %ymm12
351-
vmovups -32(%r9,%rdi,4), %ymm11
352-
vmovups (%r9,%rdi,4), %ymm0
353-
vfmadd231ps %ymm15, %ymm12, %ymm10
354-
vfmadd231ps %ymm15, %ymm11, %ymm9
355-
vfmadd231ps %ymm15, %ymm0, %ymm8
356-
vbroadcastss (%r14,%rcx,4), %ymm15
357-
vfmadd231ps %ymm15, %ymm12, %ymm7
358-
vfmadd231ps %ymm15, %ymm11, %ymm6
359-
vfmadd231ps %ymm15, %ymm0, %ymm5
360-
vbroadcastss (%rsi,%rcx,4), %ymm15
361-
vfmadd231ps %ymm15, %ymm12, %ymm4
362-
vfmadd231ps %ymm15, %ymm11, %ymm3
363-
vfmadd231ps %ymm15, %ymm0, %ymm2
364-
vbroadcastss (%rax,%rcx,4), %ymm15
365-
vfmadd213ps %ymm13, %ymm15, %ymm12
366-
vfmadd213ps %ymm14, %ymm15, %ymm11
367-
vfmadd231ps %ymm15, %ymm0, %ymm1
368-
incq %rcx
369-
addq %rbx, %rdi
370-
cmpq %rcx, %rdx
371-
jne LBB8_7
346+
.LBB8_12:
vbroadcastss (%rsi,%rdi,4), %ymm12
347+
vmovups -64(%r12,%r15,4), %ymm13
348+
vmovups -32(%r12,%r15,4), %ymm14
349+
vmovups (%r12,%r15,4), %ymm15
350+
addq %rbx, %r15
351+
vfmadd231ps %ymm12, %ymm13, %ymm11
352+
vfmadd231ps %ymm12, %ymm14, %ymm10
353+
vfmadd231ps %ymm12, %ymm15, %ymm9
354+
vbroadcastss (%r8,%rdi,4), %ymm12
355+
vfmadd231ps %ymm12, %ymm13, %ymm8
356+
vfmadd231ps %ymm12, %ymm14, %ymm7
357+
vfmadd231ps %ymm12, %ymm15, %ymm6
358+
vbroadcastss (%r10,%rdi,4), %ymm12
359+
vfmadd231ps %ymm12, %ymm13, %ymm5
360+
vfmadd231ps %ymm12, %ymm14, %ymm4
361+
vfmadd231ps %ymm12, %ymm15, %ymm3
362+
vbroadcastss (%rdx,%rdi,4), %ymm12
363+
incq %rdi
364+
vfmadd231ps %ymm13, %ymm12, %ymm2
365+
vfmadd231ps %ymm14, %ymm12, %ymm1
366+
vfmadd231ps %ymm12, %ymm15, %ymm0
367+
cmpq %rdi, %r13
368+
jne .LBB8_12
372369
```
373370
This is **40-50x** faster than a naive C implementation of nested loops on my machine, and it should be within a factor of 2 of the peak possible performance.
374371

examples/linear_algebra/matrix.cpp

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -125,32 +125,30 @@ NOINLINE void multiply_ein_reduce_matrix(
125125
// to keep all of the accumulators for the output in registers. This
126126
// generates an inner loop that looks like:
127127
//
128-
// LBB14_7:
129-
// vmovaps %ymm12, %ymm13
130-
// vmovaps %ymm11, %ymm14
131-
// vbroadcastss (%r9,%rax,4), %ymm15
132-
// vmovups -64(%r11,%rcx,4), %ymm12
133-
// vmovups -32(%r11,%rcx,4), %ymm11
134-
// vmovups (%r11,%rcx,4), %ymm0
135-
// vfmadd231ps %ymm15, %ymm12, %ymm10
136-
// vfmadd231ps %ymm15, %ymm11, %ymm9
137-
// vfmadd231ps %ymm15, %ymm0, %ymm8
138-
// vbroadcastss (%r8,%rax,4), %ymm15
139-
// vfmadd231ps %ymm15, %ymm12, %ymm7
140-
// vfmadd231ps %ymm15, %ymm11, %ymm6
141-
// vfmadd231ps %ymm15, %ymm0, %ymm5
142-
// vbroadcastss (%rdx,%rax,4), %ymm15
143-
// vfmadd231ps %ymm15, %ymm12, %ymm4
144-
// vfmadd231ps %ymm15, %ymm11, %ymm3
145-
// vfmadd231ps %ymm15, %ymm0, %ymm2
146-
// vbroadcastss (%r15,%rax,4), %ymm15
147-
// vfmadd213ps %ymm13, %ymm15, %ymm12
148-
// vfmadd213ps %ymm14, %ymm15, %ymm11
149-
// vfmadd231ps %ymm15, %ymm0, %ymm1
150-
// incq %rax
151-
// addq %rsi, %rcx
152-
// cmpq %rax, %rbx
153-
// jne LBB14_7
128+
// .LBB8_12:
129+
// vbroadcastss (%rsi,%rdi,4), %ymm12
130+
// vmovups -64(%r12,%r15,4), %ymm13
131+
// vmovups -32(%r12,%r15,4), %ymm14
132+
// vmovups (%r12,%r15,4), %ymm15
133+
// addq %rbx, %r15
134+
// vfmadd231ps %ymm12, %ymm13, %ymm11
135+
// vfmadd231ps %ymm12, %ymm14, %ymm10
136+
// vfmadd231ps %ymm12, %ymm15, %ymm9
137+
// vbroadcastss (%r8,%rdi,4), %ymm12
138+
// vfmadd231ps %ymm12, %ymm13, %ymm8
139+
// vfmadd231ps %ymm12, %ymm14, %ymm7
140+
// vfmadd231ps %ymm12, %ymm15, %ymm6
141+
// vbroadcastss (%r10,%rdi,4), %ymm12
142+
// vfmadd231ps %ymm12, %ymm13, %ymm5
143+
// vfmadd231ps %ymm12, %ymm14, %ymm4
144+
// vfmadd231ps %ymm12, %ymm15, %ymm3
145+
// vbroadcastss (%rdx,%rdi,4), %ymm12
146+
// incq %rdi
147+
// vfmadd231ps %ymm13, %ymm12, %ymm2
148+
// vfmadd231ps %ymm14, %ymm12, %ymm1
149+
// vfmadd231ps %ymm12, %ymm15, %ymm0
150+
// cmpq %rdi, %r13
151+
// jne .LBB8_12
154152
//
155153
// This appears to achieve ~70% of the peak theoretical throughput
156154
// of my machine.

0 commit comments

Comments
 (0)