// Defaults  ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

#let dark-black  = rgb(  0,   0,   0)
#let lite-black  = rgb( 16,  16,  16)

#let dark-grey   = rgb( 64,  64,  64)
#let  mid-grey   = rgb(128, 128, 128)
#let lite-grey   = rgb(192, 192, 192)

#let dark-white  = rgb(240, 240, 240)
#let blue-white  = rgb(240, 248, 255)
#let lite-white  = rgb(255, 255, 255)

#let pure-red    = rgb(224,  16,   0)
#let dark-red    = rgb(255,  64,  16)
#let lite-red    = rgb(255,  80,   0)

#let dark-blue   = rgb(  0, 160, 224)
#let  mid-blue   = rgb( 48, 176, 255) // or (0, 160, 224) or (80, 192, 255) or (80, 160, 255)
#let lite-blue   = rgb( 80, 192, 255)

#let dark-orange = rgb(208, 128,  48) // or rgb(240, 144, 64)
#let lite-orange = rgb(240, 144,  64)

#set page(
   width: 16cm,
  height:  9cm,
  margin: (
    x: 0pt,
    y: 0pt,
  )
)

#set par(
  leading: 8pt,
  spacing: 16pt,
  justify: false,
  linebreaks: "optimized"
)

#set text(
  font: "Iosevka Signalis",
  size: 12pt,
  tracking: 0.125pt,
  fallback: false,
  fill: dark-white
)

#set smartquote(enabled: false)

#show heading: set text(
  font: "Roadgeek 2014 Series D",
  size: 18pt,
  spacing: 66.67%,
  fill: dark-red
)

#show heading: set block(
  below: 15pt
)

#show raw: set text(
  font: "Iosevka Signalis Mono",
  tracking: 0pt
)

#show raw.where(block: false): set text(1.00em / 0.8)
#show raw.where(block: true ): set text(0.75em / 0.8)
#show raw.where(block: true ): set par(leading: 5pt)

#set raw(
  theme: "signalis.tmTheme"
)

#show smallcaps: sc => text(
  font: "Iosevka Signalis Md Ex",
  size: 0.75em,
  tracking: 0.0625em,
  spacing: 100% - 0.0625em,
  upper(sc)
)

#let flipped(s) = {
  s.clusters().map(c => box(scale(x: -100%, c))).join()
}

#show raw.where(lang: "demo"): set text(1.0em / 0.8)
#show raw.where(lang: "demo"): r => {
  show regex("@b\d"): it => text(dark-blue, it.text.slice(2))
  show regex("@r\d"): it => text(lite-red,  it.text.slice(2))
  r
}

#set math.mat(delim: "[")
#set math.vec(delim: "[")
#show math.equation: set text(font: "Fira Math")

// Blocks  ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

#let slide(stroke: dark-red, body) = page(
  fill: lite-black,
  background: image("img/background.svg")
)[
  #box(
     width: 100%,
    height: 100%,
    stroke: (
       top: 10pt + stroke,
      rest:  1pt + stroke,
    ),
    outset: (
       top: -5.0pt,
      rest: -0.5pt,
    ),
    inset: (
       top: 20pt,
      rest: 12pt,
    ),
    body
  )
]

#let rubox(body) = box[
  #box(
    stroke: (
       top: 10pt + dark-red,
      rest:  1pt + dark-red,
    ),
    outset: (
       top: -5.0pt,
      rest: -0.5pt,
    ),
    inset: (
         top: 16pt,
      bottom:  8pt,
           x:  8pt,
    ),
    body
  )
]

#let fill-left-label-box(
  background: lite-white,
  foreground: dark-black,
  width: auto,
  heading,
  body
) = layout(size => [
  #block(
    width:
      if width == auto {
        measure(
          width: size.width,
          block(inset: (x: 8pt, y: 8pt), body)
        ).width
      }
      else { width },
    below: 0pt,
    fill: background,
    outset: (
      x: 0pt,
      y: 0pt,
    ),
    inset: (
      x: 2pt,
      y: 2pt,
    ),
    text(
      font: "Roadgeek 2014 Series F",
      size: 10pt,
      tracking: -0.25pt,
      spacing: 50%,
      fill: foreground,
      heading
    )
  )
  #v(-1pt)
  #block(
    width: width,
    above: 0pt,
    stroke: 1pt + background,
    outset: (
      x: -0.5pt,
      y: -0.5pt,
    ),
    inset: (
      x: 8pt,
      y: 8pt,
    ),
    body
  )
])

#let fill-ctr-label-box(
  background: lite-white,
  foreground: dark-black,
  width: auto,
  heading,
  body
) = layout(size => [
  #block(
    width:
      if width == auto {
        measure(
          width: size.width,
          block(inset: (x: 8pt, y: 8pt), body)
        ).width
      }
      else { width },
    below: 2pt,
    fill: background,
    outset: (
      x: 0pt,
      y: 0pt,
    ),
    inset: (
      x: 2pt,
      y: 3pt,
    ),
    align(
      center,
      text(
        font: "Roadgeek 2014 Series F",
        size: 10pt,
        tracking: -0.25pt,
        spacing: 50%,
        fill: foreground,
        heading
      )
    )
  )
  #block(
    width: width,
    above: 2pt,
    stroke: 1pt + background,
    outset: (
      x: -0.5pt,
      y: -0.5pt,
    ),
    inset: (
      x: 8pt,
      y: 8pt,
    ),
    body
  )
])

#let outline-ctr-label-box(
  background: lite-white,
  width: auto,
  heading,
  body
) = layout(size => [
  #block(
    width:
      if width == auto {
        measure(
          width: size.width,
          block(inset: (x: 8pt, y: 8pt), body)
        ).width
      }
      else { width },
    below: 2pt,
    stroke: 1pt + background,
    outset: (
      x: -0.5pt,
      y: -0.5pt,
    ),
    inset: (
      x: 2pt,
      y: 4pt,
    ),
    align(
      center,
      text(
        font: "Roadgeek 2014 Series F",
        size: 10pt,
        tracking: -0.25pt,
        spacing: 50%,
        fill: background,
        heading
      )
    )
  )
  #block(
    width: width,
    above: 2pt,
    stroke: 1pt + background,
    outset: (
      x: -0.5pt,
      y: -0.5pt,
    ),
    inset: (
      x: 8pt,
      y: 8pt,
    ),
    body
  )
])

#let ofllbox(width: auto, heading, body) = fill-left-label-box(
  background: dark-orange,
  foreground: lite-black,
  width: width,
  heading,
  body
)

#let wfllbox(width: auto, heading, body) = fill-left-label-box(
  background: dark-white,
  foreground: dark-black,
  width: width,
  heading,
  body
)

#let rfllbox(width: auto, heading, body) = fill-left-label-box(
  background: dark-red,
  foreground: lite-black,
  width: width,
  heading,
  body
)

#let wfclbox(width: auto, heading, body) = fill-ctr-label-box(
  background: dark-white,
  foreground: dark-black,
  width: width,
  heading,
  body
)
#let rfclbox(width: auto, heading, body) = fill-ctr-label-box(
  background: dark-red,
  foreground: dark-black,
  width: width,
  heading,
  body
)
#let woclbox(width: auto, heading, body) = outline-ctr-label-box(
  background: dark-white,
  width: width,
  heading,
  body
)
#let roclbox(width: auto, heading, body) = outline-ctr-label-box(
  background: dark-red,
  width: width,
  heading,
  body
)

#let rlabel(width: auto, body) = box(
  width: width,
  stroke: 1pt + dark-red,
  outset: (
    x: -0.5pt,
    y:  1.0pt,
  ),
  inset: (
    x: 2.5pt,
    y: 1.5pt,
  ),
  text(
    font: "Roadgeek 2014 Series F",
    size: 10pt,
    tracking: -0.25pt,
    spacing: 50%,
    fill: dark-red,
    body
  )
)

#let wlabel(width: auto, body) = box(
  width: width,
  fill: dark-white,
  outset: (
    x: 0.0pt,
    y: 1.5pt,
  ),
  inset: (
    x: 2.5pt,
    y: 1.5pt,
  ),
  text(
    font: "Roadgeek 2014 Series F",
    size: 10pt,
    tracking: -0.25pt,
    spacing: 50%,
    fill: dark-black,
    body
  )
)

#let note(body) = text(fill: mid-grey, body)
#let term(body) = text(fill: mid-blue, body)
#let   hi(body) = text(fill: lite-red, body)
#let rust(body) = text(fill: lite-orange, body)

#let warning = text(fill: lite-red)[
  \[#text(
    font: "Roadgeek 2014 Series C",
    size: 1.25em,
    "ACHTUNG"
  )\]
]

#set list(marker: term[›], spacing: 8pt)

#show terms: it => {
  let max-width = 0em
  for child in it.children {
    if max-width < measure(child.term).width {
      max-width = measure(child.term).width
    }
  }
  for child in it.children {
    grid(
      columns: 2,
      // gutter: 8pt,
      box(
        width: max-width + 8pt,
        fill: dark-white,
        outset: (y: 1.5pt),
         inset: (y: 1.5pt),
        align(
          center,
          text(
            font: "Roadgeek 2014 Series F",
            size: 10pt,
            tracking: -0.25pt,
            spacing: 50%,
            fill: dark-black,
            child.term
          )
        )
      ),
      box(
        stroke: (left: 1pt + dark-white),
        outset: (y: 1.5pt, left: 0.5pt),
         inset: (left: 6pt),
        child.description
      )
    )
  }
}

#let fringe(body) = [
  #place(horizon+center, dx: -1.6pt)[#text(fill: rgb( 16,  16, 255))[#body]]
  #place(horizon+center, dx: +1.6pt)[#text(fill: rgb(240,   0,   0))[#body]]
  #place(horizon+center, dx: -1.2pt)[#text(fill: rgb( 16,  16, 255))[#body]]
  #place(horizon+center, dx: +1.2pt)[#text(fill: rgb(240,   0,   0))[#body]]
  #place(horizon+center, dx: -0.8pt)[#text(fill: rgb(  0, 224, 255))[#body]]
  #place(horizon+center, dx: +0.8pt)[#text(fill: rgb(255, 240,   0))[#body]]
  #place(horizon+center, dx: -0.4pt)[#text(fill: rgb(  0, 224, 255))[#body]]
  #place(horizon+center, dx: +0.4pt)[#text(fill: rgb(255, 240,   0))[#body]]
  #place(horizon+center,           )[#text(fill: rgb(255, 255, 255))[#body]]
]

// Slides  ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

#page(fill: pure-red)[
  #set par(leading: 5pt, spacing: 0pt)
  #set text(
    font: "Univers LT Std",
    size: 18pt,
    weight: 400,
    stretch: 50%,
    tracking: 0.5pt,
    fill: lite-white
  )
  #align(horizon + center)[
    #text(font: "Noto Serif CJK SC", size: 10pt, weight: 600, lang: "zh")[〇]
    #v(12pt)
    #text(size: 32pt)[
      {
      #text(font: "HanWangMingBlack", size: 28pt, weight: 700, lang: "zh")[零]
      }
    ]
    #v(18pt)
    THIS SLIDE \
    INTENTIONALLY \
    LEFT BLANK.
  ]
]

// Title - - - - - - - - - - - - - - - -

#page(
  fill: lite-black,
  background: image("img/background.svg")
)[
  #set par(leading: 8pt, spacing: 8pt)
  #align(horizon + center)[
    #v(24pt)
    #box(
      stroke: (
           top: 1pt + lite-white,
        bottom: 1pt + lite-white,
          rest: none
      ),
      outset: (
        x:  0pt,
        y: 12pt,
      )
    )[
      #set par(leading: 10pt)
      #text(
        font: "Roadgeek 2014 Series B",
        size: 36pt,
        fill: blue-white,
      )[
        WRITING A TOP-10 \
        CHESS ENGINE IN RUST
      ]
    ]
    #v(31pt)
    #stack(dir: ltr, spacing: 48pt)[
      #image("img/csmr-large.png", height: 17pt)
    ][
      #image("img/krar-large.png", height: 17pt)
    ]
  ]
]

// Authors - - - - - - - - - - - - - - -

#slide[
  = UNIT DESIGNATIONS

  #align(
    horizon + center,
    block(align(left + top)[
      / COSMO: Author of Viridithas (#smallcaps[spcc] rank 9) \
        #note[https://cosmo.tardis.ac]

      / KORA: Author of Expositor (_defunct_)\
        #note[https://typ.dev]
    ])
  )
]

// Content - - - - - - - - - - - - - - -

#slide[
  = PRELIMINARY DEFINITIONS

  A #smallcaps[chess engine] is a program that, given the state
  of a game of chess, attempts to both evaluäte that state and
  recommend a sequence of moves.

  #let sq_size = 1.7em
  #let r(x) = text(fill: dark-red, x)
  #let b(x) = text(fill: dark-blue, x)
  #let f(x) = text(fill: dark-black, x)
  #let l(x) = text(fill: rgb(110, 110, 110), x)
  #let os = 1.7em - 1pt
  #let box-l(x) = box(stroke: 1pt +  mid-grey, height: os, width: os, x)
  #let box-d(x) = box(stroke: 1pt + lite-grey, height: os, width: os, x)
  #align(
    center + horizon,
    grid(
      columns: 2,
      gutter: 16pt,
      align(center + horizon)[
        #set text(1em * 0.75, font: "Iosevka Signalis Mono")
        #grid(
          inset: (x: 3pt, y: 3pt),
          columns: (sq_size,) * 8,
          rows: sq_size,
          align: center + horizon,
          fill: (x, y) => {
            let c = if calc.odd(x + y) { "#131313" } else { "#303030" }
            rgb(c)
          },
          stroke: (x, y) => {
            if x == 0 { (  left: 0.7pt + dark-black) }
            if x == 7 { ( right: 0.7pt + dark-black) }
            if y == 0 { (   top: 0.7pt + dark-black) }
            if y == 7 { (bottom: 0.7pt + dark-black) }
          },
          // @typstyle off
          ..(
              b("R"),b(" "),b("B"),b("Q"),b(" "),b("B"),f(" "),b("R"),
              b("P"),b("P"),b("P"),b(" "),b(" "),b("K"),b("P"),b("P"),
              f(" "),b(" "),b("N"),b(" "),f(" "),b(" "),b(" "),b(" "),
              b(" "),f(" "),b(" "),b("N"),b("P"),b(" "),b(" "),b(" "),
              r(" "),r(" "),r("B"),r(" "),r(" "),r(" "),r(" "),r(" "),
              r(" "),f(" "),r(" "),f(" "),r(" "),r("Q"),r(" "),r(" "),
              r("P"),r("P"),r("P"),r("P"),f(" "),r("P"),r("P"),r("P"),
              r("R"),r("N"),r("B"),r(" "),r("K"),f(" "),r(" "),r("R"),
          )
        )
      ],
      align(center + horizon)[
        ⟨#h(2pt)King to e6, #r[+0.44] in White’s favor#h(2pt)⟩
      ],
    )
  )
]

#slide[
  = THE COMPONENTS OF AN ENGINE

  #v(8pt)

  / STATE: How is the chessboard represented in the program?
  / SEARCH: Constructing and pruning trees of future possibilities_!_
  / EVALUATION: How do we estimate the value of a chess position?
]

#slide[
  = STATE REPRESENTATION

  One must be able to
  - represent the present state of the game
  - query legal moves
  - apply transitions caused by moves
  - determine when & how a game has ended

  #v(8pt)

  #note[This list is highly non-exhaustive.]
]

#slide[
  = TYPES

  We use #term[enum]s to represent colors, pieces, ranks, files...

  #box(inset: (left: 12pt))[
    ```rust
    #[derive(...)]
    #[repr(u8)]
    enum PieceType {
      Pawn   = 0,
      Knight = 1,
      Bishop = 2,
      Rook   = 3,
      Queen  = 4,
      King   = 5,
    }
    ```
  ]
]

#slide[
  = TYPES

  We use #term[enum]s to represent colors, pieces, ranks, files...

  By doing this, we benefit manifold:

  / RECTITUDE: Precisely specifying the valid values for a type eliminates
    swathes of bugs.
    ```rust
    Rank::Third != File::C != PieceType::Knight
    ```
  / CONCISION: Methods on enums are a joy to use.
    ```rust
    Square::F2.relative_to(Color::Black) == Square::F7
    ```
  / ALACRITY: The more constraints we have on our data, the more we can
    optimise.
]

#slide[
  = INDEXING

  Suppose we have a type for White _vs_ Black like so:
  #align(center)[
    ```rust
    enum Color {
      White = 0,
      Black = 1,
    }
    ```
  ]

]
#slide[
  = INDEXING

  Suppose we have a type for White _vs_ Black like so:
  #align(center)[
    ```rust
    enum Color {
      White = 0,
      Black = 1,
    }
    ```
  ]

  // pause

  There are many places where we have something like:
  #align(center)[
    ```rust
    // occupancy : [u64; 2]
    // side : Color
    occupancy[side as usize]
    ```
  ]
]

#slide[
  = INDEXING

  Typing out #term[usize] every time is annoying, and we’d like to avoid
  bounds-checks. Fortunately, we can implement the trait
  #term[std`::`ops`::`Index]#h(1pt)_!_

  #box(inset: (left: 12pt))[
    ```rust
    impl<T> std::ops::Index<Color> for [T; 2] {
      type Output = T;

      fn index(&self, idx : Color) -> &Self::Output {
        unsafe { self.get_unchecked(idx as usize) }
      }
    }
    ```
  ]

  With this, we can now write “occupancy[side]” without worrying \
  about bounds-checks or #term[usize] casts. \
  #note[#emph[Typically, LLVM can be relied upon to elide bounds-checks. \
  Avoid this pattern unless profiling reveals that elision has failed.]]
]

#slide[
  = BITBOARDS

  #columns(2, gutter: 0%, [
    #term[Bitboards] are unsigned 64-bit integers that
    represent _sets of squares_ on the chessboard.

    #v(8pt)

    #align(center)[\{A1, C1, D1\}#h(0.5em)⇌ #h(0.5em) .#h(1pt).#h(1pt).#h(1pt)00001101]

    #v(8pt)

    Conveniently, 64-bits is the size of the machine word on modern hardware.
    // todo(cosmo) above line can be repatterned as
    //   set : word
    // board : cache-line

    #colbreak()
    #v(-8pt)
    #align(center)[
      #set text(font: "Iosevka Signalis Mono")

      #let sq_size = 1.7em

      #grid(
        inset: (x: 3pt, y: 3pt),
        columns: (sq_size,) * 8,
        rows: sq_size,
        align: center + horizon,
        fill: (x, y) => rgb(
          if calc.odd(x + y) { "#131313" } else { "#303030" },
        ),
        stroke: (x, y) => {
          if x == 0 { (left: 0.4pt + black) }
          if x == 7 { (right: 0.4pt + black) }
          if y == 0 { (top: 0.4pt + black) }
          if y == 7 { (bottom: 0.4pt + black) }
        },

        // @typstyle off
        ..(
            56,57,58,59,60,61,62,63,
            48,49,50,51,52,53,54,55,
            40,41,42,43,44,45,46,47,
            32,33,34,35,36,37,38,39,
            24,25,26,27,28,29,30,31,
            16,17,18,19,20,21,22,23,
            08,09,10,11,12,13,14,15,
            00,01,02,03,04,05,06,07,
        ).map(it => if str(it).len() == 1 [0] else [ ] + str(it))
      )
    ]
  ])
]

#slide[
  = BITBOARDS

  Typically one stores a chess position using 8 bitboards.

  #box(inset: (left: 12pt))[
    ```rust
    struct Position {
      sides  : [Bitboard; 2], // occupancy of each side
      pieces : [Bitboard; 6], // occupancy of each piece-type
    }
    ```
  ]

]
#slide[
  = BITBOARDS

  Typically one stores a chess position using 8 bitboards.

  #box(inset: (left: 12pt))[
    ```rust
    struct Position {
      sides  : [Bitboard; 2], // occupancy of each side
      pieces : [Bitboard; 6], // occupancy of each piece-type
    }
    ```
  ]

  // pause

  Then a query like “how many unblocked pawns are there” is merely:

  #box(inset: (left: 12pt))[
    ```rust
    let pawns = sides[Color::White] & pieces[PieceType::Pawn];
    let movable_pawns = pawns & !sides[Color::Black].south_one();
    return pawns.count_ones();
    ```
  ]

  // What might be a pair of nested loops in a naïve implementation is just
  // three array accesses, a handful of bitwise operations, and popcnt. And
  // that might bring us on to popcnt!
]

#slide[
  = ITERATION

  Given a bitboard (a square-set), how would you efficiently loop over
  the squares present in the set?
]

#slide[
  = BIT MANIPULATION
  // https://tearth.dev/posts/performance-of-bit-manipulation-instructions-bmi/

  / TZCNT: Count the number of trailing zeroes.
  / BLSR: Clear the least-significant set bit.
  / POPCOUNT: Count the number of set bits.

  #box(inset: (left: 12pt))[
    ```demo
       tzcnt(101101@r0@r0) == @r2
        blsr(10110@r100) == 0b10110@r000
    popcount(@r10@r1@r10@r100) == @r4
    ```
  ]

  #smallcaps[TZCNT] and #smallcaps[BLSR] can be paired to form efficient loops
  over set bits.
]

#slide[
  = BIT MANIPULATION

  #smallcaps[TZCNT] and #smallcaps[BLSR] can be paired to form efficient loops
  over set bits.

  #box(inset: (left: 12pt))[
    ```rust
    impl Iterator for SquareIter {
      fn next(&mut self) -> Option<Square> {
        if self.value == 0 {
          None
        } else {
          let lsb = self.value.trailing_zeros() as u8;  // TZCNT
          self.value &= self.value - 1;                 // BLSR
          Square::new(lsb)
        }
      }
    }
    ```
  ]
]

#slide[
  = BIT MANIPULATION

  Abstraction in hand, we can loop over square-sets with maximum efficiency_!_

  #box(inset: (left: 12pt))[
    ```rust
    let our_knights = board.pieces[Knight] & board.sides[side];
    for src in our_knights {
        let moves = knight_attacks(src);
        for tgt in moves & !blockers {
            moves.push(Move::new(src, tgt));
        }
    }
    ```
  ]
]

#slide[
  = EVALUÄTION

  Modern chess engines use
  Efficiently Updatable Neural Networks
  #box[#move(dy: 1pt, "(")]#term[#smallcaps[#flipped("NNUE")]]#box[#move(dy: 1pt, ")")].

  The input features of the network are entirely binary, therefore whenever a
  piece moves, we simply add or subtract columns of the embedding matrix to
  generate a new hidden state from an old one.

  #let colr(x) = text(fill: lite-red, $#x$)
  $
    mat(
      dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c;
      dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c;
      dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c;
      dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c;
    ) space dot underbrace(vec(0, 1, 0, 0, 1, 0), "BOARD")
    = vec(a, b, c, d)
    space space space -> space space space
    mat(
      dots.h.c, dots.h.c, colr(e_02), dots.h.c, dots.h.c, dots.h.c;
      dots.h.c, dots.h.c, colr(e_12), dots.h.c, dots.h.c, dots.h.c;
      dots.h.c, dots.h.c, colr(e_22), dots.h.c, dots.h.c, dots.h.c;
      dots.h.c, dots.h.c, colr(e_32), dots.h.c, dots.h.c, dots.h.c;
    ) space dot underbrace(vec(0, 1, colr(1), 0, 1, 0), "BOARD")
    = vec(a, b, c, d)
    + vec(colr(e_02), colr(e_12), colr(e_22), colr(e_32))
  $
]

#slide[
  = OPTIMIZATION OF INFERENCE

  The most expensive part of the engine
  is a 1024 → 32 fully-connected layer.

  $
    underbrace(
      mat(
        arrow.tl, arrow.t, arrow.tr;
        arrow.l, 32 × 1024, arrow.r;
        arrow.bl, arrow.b, arrow.br;
      ), "WEIGHTS"
    )
    space dot
    underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT")
    =
    underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT")
  $

]
#slide[
  = OPTIMIZATION OF INFERENCE

  The most expensive part of the engine
  is a 1024 → 32 fully-connected layer.

  $
    underbrace(
      mat(
        arrow.tl, arrow.t, arrow.tr;
        arrow.l, 32 × 1024, arrow.r;
        arrow.bl, arrow.b, arrow.br;
      ), "WEIGHTS"
    )
    space dot
    underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT")
    =
    underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT")
  $


  / QUANTIZATION: We compress the weights and inputs to `i8`.
]
#slide[
  = OPTIMIZATION OF INFERENCE

  The most expensive part of the engine
  is a 1024 → 32 fully-connected layer.

  $
    underbrace(
      mat(
        arrow.tl, arrow.t, arrow.tr;
        arrow.l, 32 × 1024, arrow.r;
        arrow.bl, arrow.b, arrow.br;
      ), "WEIGHTS"
    )
    space dot
    underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT")
    =
    underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT")
  $


  / QUANTIZATION: We compress the weights and inputs to `i8`.
  / SPARSITY: We train the input activations to be mostly zero.
]
#slide[
  = OPTIMIZATION OF INFERENCE

  The most expensive part of the engine
  is a 1024 → 32 fully-connected layer.

  $
    underbrace(
      mat(
        arrow.tl, arrow.t, arrow.tr;
        arrow.l, 32 × 1024, arrow.r;
        arrow.bl, arrow.b, arrow.br;
      ), "WEIGHTS"
    )
    space dot
    underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT")
    =
    underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT")
  $

  // pause

  / QUANTIZATION: We compress the weights and inputs to `i8`.
  // pause
  / SPARSITY: We train the input activations to be mostly zero.
  // pause
  / SIMD: We find and index the non-zero activations in parallel.
]

#slide[
  = DEDUPLICATION

  Many copies of an engine may live and play simultaneously inside the same
  machine. Reading so many copies of the network causes cache pressure, \
  so we map the same copy of the weights into all processes.

]
#slide[
  = DEDUPLICATION

  Many copies of an engine may live and play simultaneously inside the same
  machine. Reading so many copies of the network causes cache pressure, \
  so we map the same copy of the weights into all processes.

  // pause

  #box(inset: (left: 12pt))[
    ```rust
    fn weights() -> &'static Network {
      static MAP : OnceLock<Mmap> = OnceLock::new();
      MAP.get_or_init(|| {
        if !network_path.exists() {
          decompress_network_into(&network_path)
        }
        unsafe { Mmap::map(network_path) }
      })  // transmutation and inter-process coördination elided
    }
    ```
  ]
]

#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  #align(
    center,
    wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.]
  )

  #align(
    center,
    ```rust
    moves.sort_by_key(|m| {
      (exchange_winning(m), history(m)) // (bool, i32)
    });
    ```
  )

]
#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  #align(
    center,
    wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.]
  )

  #align(
    center,
    ```rust
    moves.sort_by_key(|m| {
      (exchange_winning(m), history(m)) // (bool, i32)
    });
    ```
  )


  #v(16pt)

  / REFINEMENT A: We usually only need the first 1–2 elements of the sorted
    list.
]
#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  #align(
    center,
    wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.]
  )

  #align(
    center,
    ```rust
    moves.sort_by_key(|m| {
      (exchange_winning(m), history(m)) // (bool, i32)
    });
    ```
  )


  #v(16pt)

  / REFINEMENT A: We usually only need the first 1–2 elements of the sorted
    list.
  / REFINEMENT B: The sorting key is dominated by exchange_winning.
]
#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  #align(
    center,
    wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.]
  )

  #align(
    center,
    ```rust
    moves.sort_by_key(|m| {
      (exchange_winning(m), history(m)) // (bool, i32)
    });
    ```
  )

  // pause

  #v(16pt)

  / REFINEMENT A: We usually only need the first 1–2 elements of the sorted
    list.
  // pause
  / REFINEMENT B: The sorting key is dominated by exchange_winning.
  // pause
  / REFINEMENT C: Evaluating exchange_winning is very costly.
]

#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  ```rust
  // select the maximal element each time around the loop
  while let Some((idx, max)) = moves.iter()
      .enumerate()
      .max_by_key(|(_, m)| history(m)) {
    // max is only the max-history move prior
    if !exchange_winning(max) { set_bad(max); continue; }
    emit(max);
    moves.swap(0, idx); //    :(
    moves = &mut moves[1..];
  }
  ```
  // :)

]
#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  ```rust
  // select the maximal element each time around the loop
  while let Some((idx, max)) = moves.iter()
      .enumerate()
      .max_by_key(|(_, m)| history(m)) {
    // max is only the max-history move prior
    if !exchange_winning(max) { set_bad(max); continue; }
    emit(max);
    moves.swap(0, idx); //    :(
    moves = &mut moves[1..];
  }
  ```
  // :)

  // pause

  Frustratingly, moves.swap(0, idx) emits a bounds-check for idx.
]

#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  Ideally, we'd just write through our max reference#h(2pt)—#h(2pt)we
  could just mem`::`swap it with element zero.

  #box(inset: (left: 12pt))[
    ```rust
    while let Some(max) = moves.iter_mut().max_by_key(history) {
      if !exchange_winning(max) { set_bad(max); continue; }
      emit(max);
      if done() { break; }
      std::mem::swap(&mut moves[0], max);
      //             ^^^^^^^^^^^^^ fails to borrow-check
      moves = &mut moves[1..];
    }
    ```
  ]

]
#slide[
  = THE SAFE ELIMINATION OF BOUNDS CHECKS

  Ideally, we'd just write through our max reference#h(2pt)—#h(2pt)we
  could just mem`::`swap it with element zero.

  #box(inset: (left: 12pt))[
    ```rust
    while let Some(max) = moves.iter_mut().max_by_key(history) {
      if !exchange_winning(max) { set_bad(max); continue; }
      emit(max);
      if done() { break; }
      std::mem::swap(&mut moves[0], max);
      //             ^^^^^^^^^^^^^ fails to borrow-check
      moves = &mut moves[1..];
    }
    ```
  ]

  // pause

  This doesn’t work#h(2pt)—#h(2pt)max is uniquely borrowing moves, so
  “\&mut moves[0]” fails to borrow-check.
]

#slide[
  = CELLS

  _A mutable memory location._

  #box(inset: (left: 12pt))[
    ```rust
    #[repr(transparent)]
    pub struct Cell<T : ?Sized> {
      value : UnsafeCell<T>,
    }

    impl<T> Cell<T> {
      pub fn new(value : T) -> Cell<T>;
      pub fn get(&self) -> T where T : Copy;
      pub fn set(&self, value : T);
      //         ^^^^^ a shared borrow!
    }
    ```
  ]
]

#slide[
  = THE SLICE-CELL TRICK

  ```rust
  //  &mut [T] -> &Cell<[T]> -> &[Cell<T>]
  let mut moves = Cell::from_mut(&mut moves).as_slice_of_cells();

  while let Some(max) = moves.iter().max_by_key(|m| history(m.get())) {
    if !exchange_winning(max) { set_bad(max); continue; }
    emit(max);
    if done() { break; }
    Cell::swap(&moves[0], max);
    moves = &moves[1..];
  }
  ```

  Because we can make multiple _shared_ borrows of moves, all is well in the
  world, and this routine emits no bounds-checks.
]

#slide[
  = THE TRACK MACRO

  A lot goes on inside a chess engine_!_

  #box(inset: (left: 12pt))[
    ```rust
    // what’s this value on average?
    if cheap_heuristic(...) > 42 {
      // how often does this fire?
      if expensive_heuristic(...) {
        return;
      }
    }
    ```
  ]
]

#slide[
  = THE TRACK MACRO

  With #term[macros], one can trace and aggregate statistics in an _ad-hoc_ manner.

]
#slide[
  = THE TRACK MACRO

  With #term[macros], one can trace and aggregate statistics in an _ad-hoc_ manner.

  // pause

  #box(inset: (left: 12pt))[
    ```rust
    macro_rules! track {
      ($name:expr; $v:expr) => {{
        #[distributed_slice(TRACKED_VALUES)]
        static ENTRY : TrackedValue = TrackedValue::new(const { $name });
        let value = $v;
        ENTRY.record(value as i64);
        value
      }};
      ($v:expr) => {{ track!(stringify!($v); $v) }};
    }
    ```
  ]
]

#slide[
  = DISTRIBUTED SLICE

  The #term[linkme] crate provides the \#\[distributed_slice\] macro, \
  which is _incredible_.

  #box(inset: (left: 12pt))[
    ```rust
    #[distributed_slice]
    static CONSTANTS : [i32];

    #[distributed_slice(CONSTANTS)] // foo.rs
    static A : i32 = 45;
    #[distributed_slice(CONSTANTS)] // bar.rs
    static B : i32 = 20;
    #[distributed_slice(CONSTANTS)] // baz.rs
    static C : i32 = 12;

    // main.rs
    println!("{:?}", CONSTANTS); // -> [20, 45, 12]
    ```
  ]
]

#slide[
  = THE TRACK MACRO

  The track#emph[!] macro behaves just like the identity function_!_

  #box(inset: (left: 12pt))[
    ```rust
    if track!(cheap_heuristic(...)) > 42 {
      if track!(expensive_heuristic()) {
        return;
      }
    }
    ```
  ]
]

#page(
  fill: lite-black,
  background: image("img/background.svg"),
  margin: (x: 0pt, top: 0pt, bottom: 0pt),
  image("img/track-macro-output.svg", width: 100%),
)

#slide[
  = SHARED CACHE

  _Or the “transposition table”, in traditional parlance._ \
  #note[
    (_The size of the transposition table is also known as the “hash size”,_ \
    _despite the fact that the data structure is not a hash map but a cache._)
  ]

  The cache is the only means by which threads communicate during search.

  #grid(
    columns: 2,
    gutter: 48pt,
    [```rust
    #[repr(C)]
    struct CacheEntry {
      tag   : u16,
      move  : Option<Move>,
      eval  : i16,
      score : i16,
      depth : u8,
      meta  : PackedMetadata,
    }
    ```],
    [```rust
    #[repr(C, align(32))]
    struct CacheSet {
      entries : [CacheEntry; 3],
      padding : [u8; 2],  // to avoid UB
    }

    struct Cache {
      sets : Vec<CacheSet>
    }
    ```]
  )
]

#slide[
  = SHARED CACHE

  #grid(
    columns: 2,
    gutter: 48pt,
    [```rust
    #[repr(C)]
    struct CacheEntry {
      tag   : u16,
      move  : Option<Move>,
      eval  : i16,
      score : i16,
      depth : u8,
      meta  : PackedMetadata,
    }
    ```],
    [```rust
    #[repr(C, align(32))]
    struct CacheSet {
      entries : [CacheEntry; 3],
      padding : [u8; 2],  // to avoid UB
    }

    struct Cache {
      sets : Vec<CacheSet>
    }
    ```]
  )

  ```rust

  fn derive_index_tag(num_sets : u64, key : u64) -> (usize, u16)
  ```
]

#slide[
  = ACCESSING THE CACHE

  We’ve two options:

  - Guard access with locks. \
    #note[#emph[The compiler can optimize our lookup into a single 256-bit
    load if that’s better, but the performance penalty from obtaining a lock
    is unacceptable] (#emph[or at least unnecessary, however slight it may
    be#h(2pt)—#h(2pt)hence unacceptable]).]

]
#slide[
  = ACCESSING THE CACHE

  We’ve two options:

  - Guard access with locks. \
    #note[#emph[The compiler can optimize our lookup into a single 256-bit
    load if that’s better, but the performance penalty from obtaining a lock
    is unacceptable] (#emph[or at least unnecessary, however slight it may
    be#h(2pt)—#h(2pt)hence unacceptable]).]

  // pause

  #v(0pt)

  - Use atomics. \
    #note[#emph[Four 64-bit loads, which appears to be as fast as anything
    else in practice.]]
]

#slide[
  = ACCESSING THE CACHE ATOMICALLY

  #box(inset: (left: 12pt))[
    ```rust
    pub fn lookup(&self, key : u64) -> CacheEntry
    {
      let (index, tag) = derive_index_tag(self.num_sets, key);
      let b0 = self.sets[index].block[0].load(Relaxed);
      let b1 = self.sets[index].block[1].load(Relaxed);
      let b2 = self.sets[index].block[2].load(Relaxed);
      let b3 = self.sets[index].block[3].load(Relaxed);
      let cache_set = unsafe {
        transmute::<[u64; 4], CacheSet>([b0, b1, b2, b3])
      };
      for entry in cache_set.entries {
        if entry.tag == tag { return entry; }
      }
      return CacheEntry::ZERO;
    }
    ```
  ]
]

#slide[
  = THE DUALITY OF MEM

  #v(16pt)

  #rfclbox(width: 100%, "FEARLESS CONCURRENCY")[
    Rust is a great systems language because of its powerful safe primitives
    #note[(like Arc, Condvar, Mutex, OnceLock, and so on)].
  ]

]
#slide[
  = THE DUALITY OF MEM

  #v(16pt)

  #rfclbox(width: 100%, "FEARLESS CONCURRENCY")[
    Rust is a great systems language because of its powerful safe primitives
    #note[(like Arc, Condvar, Mutex, OnceLock, and so on)].
  ]

  // pause

  #v(8pt)

  #rfclbox(width: 100%, "FEARFUL CONCURRENCY")[
    Rust is _also_ a great systems language because it takes you seriously as
    a user and lets you create your own primitives_!_ #note[(After all, the
    standard library is largely implemented in ordinary Rust.)]
  ]
]

#slide[
  #align(horizon)[
    #align(center, warning)
    #align(center, emph[
      When you use #term[unsafe], the compiler will not check that \
      soundness is upheld; you are responsible for doing so.
    ])
  ]
]

#page(
  fill: rgb(8, 8, 8),
)[
  #set par(leading: 5pt, spacing: 0pt)
  #set text(
    font: "Univers LT Std",
    size: 18pt,
    weight: 400,
    stretch: 50%,
    tracking: 1pt,
    fill: rgb(192, 192, 192),
  )
  #fringe[
    DU HAST VERSPROCHEN
  ]
]

#page(
  fill: rgb(224, 16, 0),
)[
  #set par(leading: 5pt, spacing: 0pt)
  #set text(
    font: "Univers LT Std",
    size: 18pt,
    weight: 400,
    stretch: 50%,
    tracking: 0.5pt,
    fill: rgb(255, 255, 255),
  )
  #align(horizon + center)[
    REMEMBER YOUR PROMISE
  ]
]

#slide[
  #align(center + horizon)[
    #wfclbox("INJUNCTION")[READ MARA’S BOOK.]
  ]
]

#slide[
  = TORN READS

  The widest atomic type is AtomicU64 (eight bytes), but an entry is ten bytes
  and a set is thirty(-two) bytes, so neither can be accessed atomically.

  Reading an entry requires multiple loads, and in theory, if another thread is
  issuing stores at the same time, part of the data we read might be from the
  ongoing write and part of the data might be from a previous write.

  #wfllbox("NOTE")[
    Even if torn reads weren’t possible, since we store too few tag bits to
    recover the key, we have to handle entries that appear valid but are in
    fact invalid. #note[(There’s also the very small chance of key collisions.)]
  ]
]

#slide[
  = TORN READS

  We don’t have to mitigate this, amazingly enough.
  - It’s safe because all bit patterns are valid.
  - The performance\* impact is small (but enough that it’s worth countering).

  #note[#emph[Why might this be so? We already expect the engine to be mistaken
  in its evaluätion of positions, and reading an incorrect entry is simply
  another misevaluätion, which merely degrades the quality of analysis but
  does not threaten algorithmic correctness.]]

  We can reject the vast majority of incorrect lookups by checking the legality
  of the stored move. #note[#emph[This guards against torn reads, tag aliasing,
  and key collisions, but imperfectly#h(2pt)—#h(2pt)a move might be legal in
  two colliding positions.]]

  #align(bottom, text(size: 8pt)[
    \*#h(0.25em)#note[Game-playing strength measured in Elo rating points.]
  ])
]

#slide[
  = ATOMIC PER BYTE

  RFC #strong[3301] proposes an “atomic memcpy” that permits tearing.

  #note[
    #emph[It might also provide the means to read uninitialized memory, allowing
    more performant implementations of data structures like sparse sets.]
  ]
]

#slide[
  = RUST THROUGH THE AGES

  We both started using Rust in January 2021. What’s changed since then?

  #grid(
    columns: (3em, auto),
    column-gutter: 10pt,
    row-gutter: 7pt,
    align: (right, auto),
    text(fill: dark-blue, "1.51"), [const generics · {integer}`::`unsigned_abs],
    text(fill: dark-blue, "1.58"), [identifiers in format strings],
    text(fill: dark-blue, "1.59"), [inline assembly · destructuring assignments],
    text(fill: dark-blue, "1.60"), [{integer}`::`abs_diff],
    text(fill: dark-blue, "1.65"), [let-else statements · break in labelled blocks],
    text(fill: dark-blue, "1.79"), [inline const expressions],
    text(fill: dark-blue, "1.82"), [raw pointer syntax],
    text(fill: dark-blue, "1.87"), [{integer}`::`is_multiple_of · {integer}`::`midpoint],
    text(fill: dark-blue, "1.88"), [let chains],
    text(fill: dark-blue, "1.92"), [Box`::`new_zeroed], // show example
    text(fill: dark-blue, "1.93"), [[T]`::`as_array],
  )
]

// Final - - - - - - - - - - - - - - - -

#page(fill: dark-black)[  // rgb(192, 192, 200)
  #set par(leading: 7.5pt, spacing: 0pt)
  #set text(
    font: "Univers LT Std",
    size: 27pt,
    weight: 400,
    stretch: 50%,
    tracking: 1pt,
    fill: lite-white, // rgb(248, 248, 255)
  )
  #fringe[
    #text(font: "Noto Serif CJK SC", size: 15pt, weight: 600, lang: "zh")[△] // or □ or ☖?
    #v(18pt)
    #text(size: 48pt)[
      {
      #box[#move(dx: 2pt, dy: 4pt)[
        #text(font: "Noto Serif CJK SC", size: 42pt, weight: 700, lang: "zh")[终] // or 完?
      ]]
      }
    ]
    #v(27pt)
    THIS SLIDE \
    INTENTIONALLY \
    LEFT BLANK.
  ]
]
