You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: README.md
+24-27Lines changed: 24 additions & 27 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -341,34 +341,31 @@ For example, a matrix multiplication can be tiled like so:
341
341
}
342
342
```
343
343
344
-
This generates the following machine code(\*) for the inner loop using clang 11 with -O2 -ffast-math:
344
+
This generates the following machine code(\*) for the inner loop when compiled with clang 18 using `-O2 -ffast-math`:
345
345
```assembly
346
-
LBB8_7:
347
-
vmovaps %ymm12, %ymm13
348
-
vmovaps %ymm11, %ymm14
349
-
vbroadcastss (%r13,%rcx,4), %ymm15
350
-
vmovups -64(%r9,%rdi,4), %ymm12
351
-
vmovups -32(%r9,%rdi,4), %ymm11
352
-
vmovups (%r9,%rdi,4), %ymm0
353
-
vfmadd231ps %ymm15, %ymm12, %ymm10
354
-
vfmadd231ps %ymm15, %ymm11, %ymm9
355
-
vfmadd231ps %ymm15, %ymm0, %ymm8
356
-
vbroadcastss (%r14,%rcx,4), %ymm15
357
-
vfmadd231ps %ymm15, %ymm12, %ymm7
358
-
vfmadd231ps %ymm15, %ymm11, %ymm6
359
-
vfmadd231ps %ymm15, %ymm0, %ymm5
360
-
vbroadcastss (%rsi,%rcx,4), %ymm15
361
-
vfmadd231ps %ymm15, %ymm12, %ymm4
362
-
vfmadd231ps %ymm15, %ymm11, %ymm3
363
-
vfmadd231ps %ymm15, %ymm0, %ymm2
364
-
vbroadcastss (%rax,%rcx,4), %ymm15
365
-
vfmadd213ps %ymm13, %ymm15, %ymm12
366
-
vfmadd213ps %ymm14, %ymm15, %ymm11
367
-
vfmadd231ps %ymm15, %ymm0, %ymm1
368
-
incq %rcx
369
-
addq %rbx, %rdi
370
-
cmpq %rcx, %rdx
371
-
jne LBB8_7
346
+
vbroadcastss (%rsi,%rdi,4), %ymm12
347
+
vmovups -64(%r12,%r15,4), %ymm13
348
+
vmovups -32(%r12,%r15,4), %ymm14
349
+
vmovups (%r12,%r15,4), %ymm15
350
+
addq %rbx, %r15
351
+
vfmadd231ps %ymm12, %ymm13, %ymm11
352
+
vfmadd231ps %ymm12, %ymm14, %ymm10
353
+
vfmadd231ps %ymm12, %ymm15, %ymm9
354
+
vbroadcastss (%r8,%rdi,4), %ymm12
355
+
vfmadd231ps %ymm12, %ymm13, %ymm8
356
+
vfmadd231ps %ymm12, %ymm14, %ymm7
357
+
vfmadd231ps %ymm12, %ymm15, %ymm6
358
+
vbroadcastss (%r10,%rdi,4), %ymm12
359
+
vfmadd231ps %ymm12, %ymm13, %ymm5
360
+
vfmadd231ps %ymm12, %ymm14, %ymm4
361
+
vfmadd231ps %ymm12, %ymm15, %ymm3
362
+
vbroadcastss (%rdx,%rdi,4), %ymm12
363
+
incq %rdi
364
+
vfmadd231ps %ymm13, %ymm12, %ymm2
365
+
vfmadd231ps %ymm14, %ymm12, %ymm1
366
+
vfmadd231ps %ymm12, %ymm15, %ymm0
367
+
cmpq %rdi, %r13
368
+
jne .LBB8_12
372
369
```
373
370
This is **40-50x** faster than a naive nested-loop C implementation on my machine, and it should be within a factor of 2 of the peak possible performance.
0 commit comments