IL emitted for non-capturing lambda declarations can be improved #71049
Unanswered
neon-sunset
asked this question in
Ideas
Replies: 4 comments 5 replies
-
@stephentoub for guidance. |
Beta Was this translation helpful? Give feedback.
0 replies
-
Example of the codegen for the improved pattern (C#):
// Baseline form: the lambda passed to Select uses only its own parameter `i`
// and captures no locals or instance state.
static IEnumerable<int> GetRange()
{
return Enumerable.Range(0, 100).Select(i => i * 2);
}
// Variant that passes the delegate stored in the static readonly field
// Holder.Func instead of declaring a lambda at the call site.
static IEnumerable<int> GetRangeOptimized()
{
return Enumerable.Range(0, 100).Select(Holder.Func);
}
// Dedicated static class that owns the delegate used by GetRangeOptimized.
static class Holder
{
// Assigned once by the field initializer; wraps the private static method below.
public static readonly Func<int, int> Func = new(__GetRange);
// Returns i * 2, the same computation as the lambda in GetRange.
private static int __GetRange(int i)
{
return i * 2;
}
} Tier 1: GetRange()
; Assembly listing for method Program:<<Main>$>g__GetRange|0_0():System.Collections.Generic.IEnumerable`1[int] (Tier1)
; Emitting BLENDED_CODE for generic ARM64 - MacOS
; Tier1 code
; optimized code
; optimized using Dynamic PGO
; fp based frame
; partially interruptible
; with Dynamic PGO: edge weights are valid, and fgCalledCount is 23378.09
; 1 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
;; Prologue: reserve a 0x20-byte frame, save fp/lr and the callee-saved x19/x20, set up fp.
G_M000_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x20]!
stp x19, x20, [sp, #0x10]
mov fp, sp
G_M000_IG02: ;; offset=0x000C
;; Build a 64-bit constant in x0 and call CORINFO_HELP_NEWSFAST (an allocation helper,
;; judging by its name); the returned object is kept in x19 across the following calls.
movz x0, #0x14E8
movk x0, #0x4AA LSL #16
movk x0, #1 LSL #32
bl CORINFO_HELP_NEWSFAST
mov x19, x0
;; Store the current managed thread id at [x19+0x08], then 0 at +0x14 and 100 at +0x18.
;; NOTE(review): 0 and 100 match the Range(0, 100) arguments; field meanings are not shown here.
bl System.Environment:get_CurrentManagedThreadId():int
str w0, [x19, #0x08]
str wzr, [x19, #0x14]
mov w0, #100
str w0, [x19, #0x18]
;; Load the current value of the static slot at 0x7_0000_5D50 into x1.
movz x0, #0x5D50
movk x0, #7 LSL #32
ldr x1, [x0]
cbz x1, G_M000_IG05 ;; <-- manual init check: take the slow path while the slot is still null
;; Fast path / rejoin point: x0 = object from x19, x1 = delegate, then an indirect call
;; through the address loaded into x2 (presumably Enumerable.Select -- confirm).
G_M000_IG03: ;; offset=0x0044
mov x0, x19
movz x2, #0x6370
movk x2, #0x4AD LSL #16
movk x2, #1 LSL #32
ldr x2, [x2]
blr x2
;; Epilogue: restore x19/x20 and fp/lr, release the frame; x0 from the call above is returned unchanged.
G_M000_IG04: ;; offset=0x005C
ldp x19, x20, [sp, #0x10]
ldp fp, lr, [sp], #0x20
ret lr
;; --- inline Func<int, int> initialization logic that cannot be optimized away ---
;; Slow path, reached only from the cbz in G_M000_IG02.
G_M000_IG05: ;; offset=0x0068
;; Allocate a second object and keep it in x20.
movz x0, #0xFCC8
movk x0, #0x4A9 LSL #16
movk x0, #1 LSL #32
bl CORINFO_HELP_NEWSFAST
mov x20, x0
;; x1 = value of the static slot at 0x7_0000_5D48 (8 bytes below the slot tested above).
movz x1, #0x5D48
movk x1, #7 LSL #32
ldr x1, [x1]
;; Indirect call with x0 = new object, x1 = loaded value, x2 = a 64-bit constant
;; (presumably the delegate constructor taking target and method pointer -- confirm).
mov x0, x20
movz x2, #0x2298
movk x2, #0x4AD LSL #16
movk x2, #1 LSL #32
movz x3, #0x4210
movk x3, #0x49F LSL #16
movk x3, #1 LSL #32
ldr x3, [x3]
blr x3
;; x14 = address of the slot tested in G_M000_IG02, x15 = new object, then CORINFO_HELP_ASSIGN_REF
;; (presumably the reference store that publishes the delegate into that slot -- confirm).
movz x14, #0x5D50
movk x14, #7 LSL #32
mov x15, x20
bl CORINFO_HELP_ASSIGN_REF
;; Rejoin the fast path with x1 = the newly created object.
mov x1, x20
b G_M000_IG03
; Total bytes of code 196
Tier 1: GetRangeOptimized()
; Assembly listing for method Program:<<Main>$>g__GetRangeOptimized|0_1():System.Collections.Generic.IEnumerable`1[int] (Tier1)
; Emitting BLENDED_CODE for generic ARM64 - MacOS
; Tier1 code
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; 1 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
;; Prologue: reserve a 0x20-byte frame, save fp/lr and only x19 (no second callee-saved register is used).
G_M000_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x20]!
str x19, [sp, #0x18]
mov fp, sp
G_M000_IG02: ;; offset=0x000C
;; Build a 64-bit constant in x0 and call CORINFO_HELP_NEWSFAST (an allocation helper,
;; judging by its name); the returned object is kept in x19.
movz x0, #0x14E8
movk x0, #0x4AA LSL #16
movk x0, #1 LSL #32
bl CORINFO_HELP_NEWSFAST
mov x19, x0
;; Store the current managed thread id at [x19+0x08], then 0 at +0x14 and 100 at +0x18.
;; NOTE(review): 0 and 100 match the Range(0, 100) arguments; field meanings are not shown here.
bl System.Environment:get_CurrentManagedThreadId():int
str w0, [x19, #0x08]
str wzr, [x19, #0x14]
mov w1, #100
str w1, [x19, #0x18]
;; The loaded value is used directly: there is no null test and no slow-path block in this listing.
movz x1, #0x5C80 ;; <-- construct the static field address
movk x1, #7 LSL #32
ldr x1, [x1] ;; <-- load Func<int, int> reference, that's it!
;; x0 = object from x19, x1 = delegate, then an indirect call through the address loaded
;; into x2 (presumably Enumerable.Select -- confirm).
mov x0, x19
movz x2, #0x6370
movk x2, #0x4AD LSL #16
movk x2, #1 LSL #32
ldr x2, [x2]
blr x2
;; Epilogue: restore x19 and fp/lr, release the frame; x0 from the call above is returned unchanged.
G_M000_IG03: ;; offset=0x0058
ldr x19, [sp, #0x18]
ldp fp, lr, [sp], #0x20
ret lr
; Total bytes of code 100 |
Beta Was this translation helpful? Give feedback.
2 replies
-
cc: @VSadov |
Beta Was this translation helpful? Give feedback.
0 replies
-
This was discussed before in dotnet/csharplang#6746 (reply in thread) and is unlikely to be changed in this way. However, dotnet/runtime#85014 was created as an alternative fix for it. |
Beta Was this translation helpful? Give feedback.
3 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
Despite the .NET compiler's static initialization improvements over the years, Roslyn does not appear to take advantage of them: it emits a JIT-unfriendly pattern for lambda declarations. In theory, this could be improved, yielding performance and code-size wins.
Consider the following method:
Currently, it is being lowered to
Unfortunately, this completely defeats the static-init optimizations done when GetRange() reaches Tier 1 compilation.
Instead, Roslyn could possibly emit
This would allow the init check to be completely optimized away in Tier 1 and would lower the cost of referencing/declaring non-capturing lambdas to just a couple of instructions. Additionally, with this form, Native AOT could theoretically start recognizing such pre-init patterns and optimize them away completely during compilation (when interpreting cctors).
Additionally, it avoids initializing all lambdas in the class by splitting holders into separate static classes per method.
I don't know whether this is specification-compliant, but it would be really nice if it was.
Beta Was this translation helpful? Give feedback.
All reactions