convert_text.sh - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
convert_text.sh (691B)
---
#!/bin/sh
# Convert three page ranges of woodpecker.pdf to text with mutool, filter the
# extracted text with awk and concatenate the results.
#
# Page ranges per level:
#   easy:         33 - 69
#   intermediate: 71 - 197
#   hard:         199 - 222
#
# Outputs (relative to the current directory):
#   txt/<level>.txt  raw text as written by mutool
#   <level>.txt      filtered text
#   labels.txt       easy.txt, intermediate.txt and hard.txt concatenated,
#                    in that order
#
# Requires: mutool, awk.

# Stop at the first failing command (or unset variable) instead of carrying
# on and writing an incomplete labels.txt.
set -eu

pdf="woodpecker.pdf"

# convert_level LEVEL RANGE
# Extract pages RANGE of $pdf to txt/LEVEL.txt, then write the filtered
# text to LEVEL.txt.
convert_level() {
  level=$1
  range=$2

  mutool convert -F text -o "txt/$level.txt" "$pdf" "$range"

  # LC_ALL=C makes awk work on bytes: length() counts bytes and the
  # multi-byte dash below is matched as a plain byte sequence.
  LC_ALL=C awk '
    # Drop lines of at most 3 bytes.
    length($0) <= 3 { next }
    # Drop lines mentioning "solutions" / "Solutions".
    /[sS]olutions/ { next }
    # Drop lines starting with byte 0xEF. Written in octal because POSIX awk
    # defines \ddd escapes but not \x escapes.
    # NOTE(review): presumably these are symbol glyphs in the U+F000..U+FFFF
    # range (UTF-8 lead byte 0xEF) - confirm against the PDF.
    /^\357/ { next }
    # Drop lines consisting only of optional digits followed by optional
    # spaces.
    /^[0-9]*[ ]*$/ { next }
    {
      line = $0
      gsub("–", "-", line)  # normal ASCII dash
      # Trim leading and trailing spaces.
      sub(/^ +/, "", line)
      sub(/ +$/, "", line)
      print line
    }
  ' < "txt/$level.txt" > "$level.txt"
}

mkdir -p txt

convert_level easy 33-69
convert_level intermediate 71-197
convert_level hard 199-222

cat easy.txt intermediate.txt hard.txt > labels.txt