generate.sh: improve portability and performance, use randomness using a seed - chess-puzzles - chess puzzle book generator
 (HTM) git clone git://git.codemadness.org/chess-puzzles
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 667fa14261d797a2c04938992a4efa5061e558a2
 (DIR) parent b9e10f90912e4d6c82e4a4738a2fcdbd77b0d6db
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Thu, 21 Dec 2023 18:14:27 +0100
       
       generate.sh: improve portability and performance, use randomness using a seed
       
       - Replace shuf with a custom shuffling using awk and sort -R.
       - Use a seed for the randomness, so the output is random but deterministic.
         Allows regenerating the same output (at least on the same machine).
       - Generating the puzzles is faster: measured on one machine, it went from 10s to 3.5s.
       - Show an error message when the CSV database file doesn't exist yet.
       
       Diffstat:
         M generate.sh                         |      92 +++++++++++++++++++------------
       
       1 file changed, 58 insertions(+), 34 deletions(-)
       ---
 (DIR) diff --git a/generate.sh b/generate.sh
       @@ -1,12 +1,47 @@
        #!/bin/sh
        
        fenbin="./fen"
       +db="lichess_db_puzzle.csv"
       +
       +if ! test -f "$db"; then
       +        printf 'File "%s" not found, run `make db` to update it\n' "$db" >&2
       +        exit 1
       +fi
        
        index="puzzles/index.html"
        rm -rf puzzles
        mkdir -p puzzles/solutions
        
        solutions="$(mktemp)"
       +seedfile="$(mktemp)"
       +seed=20231221 # must be a integer value
       +# seed for random sorting, makes it deterministic for the same system
       +# seed must be sufficiently long.
       +echo "${seed}_chess_puzzles" > "$seedfile"
       +
       +# shuffle(file, amount)
       +shuffle() {
       +        f="$1"
       +        total="$2"
       +        nlines="$(wc -l < "$f")"
       +        nlines="$((nlines + 0))"
       +        results="$(mktemp)"
       +
       +# generate list of lines to use. Not perfectly random but good enough.
       +LC_ALL=C awk -v "seed=$seed" -v "nlines=$nlines" -v "total=$total" '
       +BEGIN {
       +        srand(seed);
       +        for (i = 0; i < total; i++)
       +                sel[int(rand() * nlines)] = 1;
       +}
       +sel[NR] {
       +        print $0;
       +}' "$f" > "$results"
       +
       +        # now we have less results we can use the slow sort -R.
       +        sort -R --random-source "$seedfile" "$results"
       +        rm -f "$results"
       +}
        
        cat > "$index" <<!
        <!DOCTYPE html>
       @@ -38,42 +73,30 @@ footer {
        !
        
        # shuffle, some sort of order and point system based on rating of puzzle.
       -db="lichess_db_puzzle.csv"
        count=1
        
       -(grep 'mateIn1' < "$db" | shuf -n 100 | sed 10q
       -grep 'mateIn2' < "$db" | shuf -n 100 | sed 10q
       -grep 'mateIn3' < "$db" | shuf -n 100 | sed 10q
       -grep 'mateIn4' < "$db" | shuf -n 100 | sed 10q
       -LC_ALL=C awk -F ',' '(" " $8 " ") ~ / mateIn5 / && int($4) < 2000 { print $0 }' "$db" | shuf -n 100 | sed 5q
       -LC_ALL=C awk -F ',' '(" " $8 " ") ~ / mateIn5 / && int($4) >= 2000 { print $0 }' "$db" | shuf -n 100 | sed 3q
       -LC_ALL=C awk -F ',' '(" " $8 " ") ~ / mateIn5 / && int($4) >= 2700 { print $0 }' "$db" | shuf -n 100 | sed 2q
       -) |
       -LC_ALL=C awk -F ',' '
       -{
       -        points="1 point"; # default
       -}
       -(" " $8 " ") ~ / mateIn2 / {
       -        points="2 points";
       -}
       -(" " $8 " ") ~ / mateIn3 / {
       -        points="3 points";
       -}
       -(" " $8 " ") ~ / mateIn4 / {
       -        points="4 points";
       -}
       -(" " $8 " ") ~ / mateIn5 / && int($4) < 2000 {
       -        points="5 points";
       -}
       -(" " $8 " ") ~ / mateIn5 / && int($4) >= 2000 {
       -        points="7 points";
       -}
       -(" " $8 " ") ~ / mateIn5 / && int($4) >= 2700 {
       -        points="10 points";
       -}
       -{
       -        print $0 "," points;
       -}' | \
       +groupsdir="$(mktemp -d)"
       +test "$groupsdir" = "" && exit 1
       +
       +grep 'mateIn1' "$db" > "$groupsdir/matein1.csv"
       +grep 'mateIn2' "$db" > "$groupsdir/matein2.csv"
       +grep 'mateIn3' "$db" > "$groupsdir/matein3.csv"
       +grep 'mateIn4' "$db" > "$groupsdir/matein4.csv"
       +grep 'mateIn5' "$db" > "$groupsdir/matein5.csv"
       +LC_ALL=C awk -F ',' 'int($4) < 2000 { print $0 }' "$groupsdir/matein5.csv" > "$groupsdir/matein5_lt_2000.csv"
       +LC_ALL=C awk -F ',' 'int($4) >= 2000 { print $0 }' "$groupsdir/matein5.csv" > "$groupsdir/matein5_ge_2000.csv"
       +LC_ALL=C awk -F ',' 'int($4) >= 2700 { print $0 }' "$groupsdir/matein5.csv" > "$groupsdir/matein5_ge_2700.csv"
       +
       +(
       +shuffle "$groupsdir/matein1.csv" 100 | sed 10q | LC_ALL=C awk '{ print $0 ",1 point" }'
       +shuffle "$groupsdir/matein2.csv" 100 | sed 10q | LC_ALL=C awk '{ print $0 ",2 points" }'
       +shuffle "$groupsdir/matein3.csv" 100 | sed 10q | LC_ALL=C awk '{ print $0 ",3 points" }'
       +shuffle "$groupsdir/matein4.csv" 100 | sed 10q | LC_ALL=C awk '{ print $0 ",4 points" }'
       +shuffle "$groupsdir/matein5_lt_2000.csv" 100 | sed 5q | LC_ALL=C awk '{ print $0 ",5 points" }'
       +shuffle "$groupsdir/matein5_ge_2000.csv" | sed 3q | LC_ALL=C awk '{ print $0 ",7 points" }'
       +shuffle "$groupsdir/matein5_ge_2700.csv" | sed 2q | LC_ALL=C awk '{ print $0 ",10 points" }'
       +rm -rf "$groupsdir"
       +) | \
        while read -r line; do
                i="$count"
                fen=$(printf '%s' "$line" | cut -f 2 -d ',')
       @@ -196,3 +219,4 @@ cat >> "$index" <<!
        !
        
        rm -f "$solutions"
       +rm -f "$seedfile"