create_text_index.sh - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       create_text_index.sh (620B)
       ---
            #!/bin/sh
            # Create a search index from text files.
            #
            # Usage: create_text_index.sh file...
            #
            # Writes one line per input file to stdout:
            #   <basename of file> TAB <keywords>
            # <keywords> is the sorted, de-duplicated keyword list of the file;
            # every keyword is followed by one space, so a non-empty list ends
            # in a space.
            #
            # Keywords are lowercased, apostrophes are removed ("don't" becomes
            # "dont"), any other character outside a-z and 0-9 separates words,
            # and words shorter than three characters are dropped. Search code
            # should normalize its query the same way before matching.
            #
            # The exit status is non-zero if any input had to be skipped.

            # extract_keywords: read text on stdin, write its keyword list to
            # stdout. The body is a subshell so the locale override stays local.
            extract_keywords() (
                    # Force the C locale: character ranges and sort order are
                    # then byte-wise and identical on every system, and bytes
                    # outside ASCII are simply treated as word separators.
                    LC_ALL=C
                    export LC_ALL

                    # 1. drop apostrophes  2. lowercase  3. turn every run of
                    # other characters into one newline  4. keep words of 3+
                    # characters  5. sort and de-duplicate  6. join with spaces.
                    tr -d "'" |
                            tr '[:upper:]' '[:lower:]' |
                            tr -cs 'a-z0-9' '[\n*]' |
                            awk 'length($0) > 2' |
                            sort -u |
                            tr '\n' ' '
            )

            # index_files: print one index line for each file operand.
            # Returns 0 if every operand was indexed, 1 if any was skipped.
            index_files() {
                    rc=0
                    for p in "$@"; do
                            # A directory can be opened but not read; the read
                            # error would be hidden by the pipeline's status.
                            if [ -d "$p" ]; then
                                    printf '%s: is a directory, skipped\n' "$p" >&2
                                    rc=1
                                    continue
                            fi

                            # If the file cannot be opened the shell reports it
                            # on stderr; do not emit an empty index entry.
                            if ! keywords=$(extract_keywords < "$p"); then
                                    rc=1
                                    continue
                            fi

                            # "--" keeps names starting with "-" from being
                            # parsed as options.
                            b=$(basename -- "$p")
                            printf '%s\t%s\n' "$b" "$keywords"
                    done
                    return "$rc"
            }

            # Last command: its return value becomes the script's exit status.
            index_files "$@"