Optimize std_detect's caching #908
Conversation
Thanks for the pull request, and welcome! The Rust team is excited to review your changes, and you should hear from @Amanieu (or someone else) soon. If any changes to this PR are deemed necessary, please add them as extra commits. This ensures that the reviewer can see what has changed since they last reviewed the code. Due to the way GitHub handles out-of-date commits, this should also make it reasonably obvious what issues have or haven't been addressed. Large or tricky changes may require several passes of review and changes. Please see the contribution instructions for more information.
This allows modifications made to it, such as env overrides, to be handled properly (immediately).
For clarity, here are the things I still don't really like about the cache code, which I refer to in a few places above when I say "I have other things I'd like to do later":
(It feels like this code hasn't gotten a lot of love/attention, tbh, which is fine, but yeah, I'd like to help fix that.)
I think you can save an additional instruction by using 0 instead of -1 for the uninitialized state. At least ARM/AArch64 and RISC-V have specific instructions for "branch if zero/non-zero".
New asm:
So yes, this saves an instruction on aarch64 (though apparently not on 32-bit ARM, which looks like it still needs an explicit compare). I also think it's a little cleaner and more obvious to use 0 for uninitialized this way, but I didn't want to change it for no reason.
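For illustration, a minimal sketch of the zero-sentinel check; the names here are hypothetical, not `std_detect`'s actual code:

```rust
use core::sync::atomic::{AtomicUsize, Ordering};

// Hypothetical cache word: 0 means "uninitialized".
static CACHE: AtomicUsize = AtomicUsize::new(0);

// With 0 as the sentinel, this check is a plain zero test, which AArch64
// and RISC-V can emit as a single branch instruction (cbz/cbnz, beqz/bnez).
// With !0 as the sentinel, the compiler must first compare against -1.
#[inline]
fn is_initialized() -> bool {
    CACHE.load(Ordering::Relaxed) != 0
}
```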
(Hi, sorry if this is out of the blue. Also very sorry if I left something I meant to remove in there — I tried a lot of variations, and the diff looks clean to me, but I could easily just have selective blindness about it. I have a few more improvements I want to make after this too, and I'm even willing to do them as part of this patch set, but I wanted to test the water after doing this much).
Currently (prior to this patch), `std_detect`'s `Cache` uses `SeqCst` orderings for all operations. It notes that it could actually use `Relaxed`, but that `SeqCst` is safer. When testing a feature, it performs the following pattern of loads/stores:

- `load(SeqCst)` to test for initialization.
- If uninitialized: `store(SeqCst)` of the detected features.
- `load(SeqCst)` again to test for the feature.

This performs either two loads, or two loads and a store (well, two stores, because there are two `Cache`s even when `size_of::<usize>() == 8` — I hope to address this, but not in this patch). Ideally, this would only perform one load in the fast path. The slow (need-init) path also only needs one load and one store*.

Now it performs (sketched after this list):
- `load(Relaxed)`; if initialized, test for the feature using that same load.
- If uninitialized: `store(SeqCst)`, but test for the feature using the value we just stored in the cache (i.e., no need to load what was just written).
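A minimal sketch of that pattern, assuming a single cache word and hypothetical names (`check_feature`, `detect_features`, and the `INITIALIZED` marker bit are inventions of this sketch, not the real code; the real code splits the bits across two `Cache`s):

```rust
use core::sync::atomic::{AtomicUsize, Ordering};

// Sketch assumptions: one cache word, 0 = "uninitialized", and a marker
// bit set on every stored value so real values are never 0.
static CACHE: AtomicUsize = AtomicUsize::new(0);
const INITIALIZED: usize = 1 << (usize::BITS - 1);

#[inline]
pub fn check_feature(bit: u32) -> bool {
    let cached = CACHE.load(Ordering::Relaxed);
    if cached != 0 {
        // Fast path: the one Relaxed load answers both "initialized?"
        // and the feature query itself.
        return cached & (1 << bit) != 0;
    }
    // Slow path: detect once, store once with SeqCst, and test against
    // the value we just computed instead of re-loading what was written.
    let value = detect_features() | INITIALIZED;
    CACHE.store(value, Ordering::SeqCst);
    value & (1 << bit) != 0
}

fn detect_features() -> usize {
    0 // stub standing in for the real OS/CPU feature query
}
```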
Improving code size / branch prediction

In doing all this, I took a peek at the assembly for code that uses the feature-test macros. Unfortunately, the cache initialization code is both getting inlined and inserted into the hot part of the code.
I played around with it a bit and found that some restructuring of how that module is called was needed to fix the issue entirely. In particular, `detect::cache` now has a `#[cold]` `detect_and_initialize` function, which calls `detect::os::detect_features` directly. Previously this was passed in as a function argument to `cache::test`. I tried a lot of different ways of structuring this to keep `cache` from needing to know about `detect_features`, but ultimately couldn't (and in truth, this way doesn't seem that bad to me anyway; it just seemed like something the code was intentionally trying to keep separate).
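Roughly the restructuring described above, sketched with stub bodies and hypothetical names; the point is just the `#[cold]` attribute and the direct call versus a function argument:

```rust
fn detect_features() -> usize {
    0 // stub standing in for detect::os::detect_features
}

// Before (sketch): the detection routine was passed into cache::test as
// an argument, and LLVM tended to inline the cold initialization path
// into the hot caller.
fn test_before(bit: u32, detect_features: fn() -> usize) -> bool {
    // ...cache lookup elided; on a miss:
    detect_features() & (1 << bit) != 0
}

// After (sketch): the cache module calls the detection routine directly
// from a #[cold] function, which keeps the slow path out of the hot code.
#[cold]
fn detect_and_initialize() -> usize {
    detect_features()
}

fn test_after(bit: u32) -> bool {
    // ...cache lookup elided; on a miss:
    detect_and_initialize() & (1 << bit) != 0
}
```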
Results

This kind of change is almost impossible to benchmark with normal tools. The issue is that these orderings are almost entirely about the processor's cache-coherence protocol, so the cost only shows up when hammering the cache from a lot of threads**.
Additionally, for code this small, benchmark tools are better off measuring minimum cycle count (since the variation really should be quite small) rather than average wall time, in my experience, but there's no good tool for this in Rust currently. In lieu of benchmark numbers, I've provided before/after assembly for x86_64 and aarch64, and llvm-mca output for x86_64.
Assembly for x86_64/aarch64
All of the listings were generated from a secondary crate that depends on `std_detect`, for the following implementations:
After for `x86_64-unknown-linux-gnu`
Before for `x86_64-unknown-linux-gnu`
After for `aarch64-pc-windows-msvc`
Before for `aarch64-pc-windows-msvc`
But it's worth noting: if we used this from a context where inlining would be possible (eventually the stdlib?), the "before" becomes this (🙀):
Before for `aarch64-pc-windows-msvc` from inside `std_detect` (LLVM making bad inlining decisions)
(On the bright side, this is what made me realize I needed to call it from a separate crate to generate this assembly, which also showed me that, for some reason, a well-placed `core::intrinsics::unlikely` isn't sufficient 😔.)

llvm-mca output

`llvm-mca` is fairly arcane, but it does a pretty good job of telling you which of two small, hard-to-benchmark snippets is easier on the processor: https://llvm.org/docs/CommandGuide/llvm-mca.html

The `llvm-mca` output for this identifies the resource interference from the aggressive synchronization in the old code, and spits out some numbers about cycles/uOps (which are always hard to know what to make of, since it's modeling an out-of-order core without a model for branch prediction). That said, it's pretty confidently saying this is a decent improvement.
Old: Iterations: 100, Instructions: 2100, Total Cycles: 2789, Total uOps: 4800
New: Iterations: 100, Instructions: 1500, Total Cycles: 1497, Total uOps: 2600

Per iteration, that's 21 → 15 instructions, roughly 27.9 → 15.0 cycles, and 48 → 26 uOps.
* I don't care as much about the need-init path, and might consider re-loading with a stronger ordering to avoid any chance of re-querying the features. That said, in practice on most hardware this shouldn't happen, because the `SeqCst` writes in the store will effectively "publish" the cache lines containing the changes. (And to be clear: "in practice shouldn't happen" is probably good enough for a cache — if we needed to guarantee that only one initialization happens at a time, we'd need a lock anyway.)
** Some of the motivation here is wanting to use this in optimized functions that any/every thread will conceivably call (admittedly, the obvious contenders are in libcore rather than libstd, but one thing at a time...). Anyway, using overly strict orderings in those cases would absolutely cause performance issues.