create_text_index.sh - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
create_text_index.sh (620B)
---
#!/bin/sh
# Create a search index from text files.
#
# Usage: create_text_index.sh file...
#
# Writes one line per readable input file to stdout:
#
#   <basename of the file> TAB <keyword> SPACE <keyword> SPACE ...
#
# Keywords are the unique runs of ASCII letters and digits found in the file
# that are longer than two characters, lowercased and sorted. Apostrophes are
# removed before splitting (so "don't" is indexed as "dont"); every other
# character acts as a separator. Every keyword, including the last one, is
# followed by a single space.
#
# The logic for searching should strip the same characters from the query
# before matching it against the index.

# Read text on stdin and write its keyword list to stdout as a single line
# without a trailing newline.
# The body is a subshell so the locale override does not leak to the caller.
extract_keywords() (
  # The C locale makes the character ranges and the sort order byte-based:
  # the index does not depend on the locale of the caller, and every byte
  # that is not an ASCII letter or digit is treated as a separator.
  LC_ALL=C
  export LC_ALL

  # 1. drop apostrophes
  # 2. turn each run of other non-alphanumeric bytes into one newline
  # 3. lowercase
  # 4. keep words longer than two characters (this also drops empty lines)
  # 5. sort and remove duplicates
  # 6. join the words on one line, each followed by a space
  tr -d "'" |
    tr -cs 'a-zA-Z0-9' '\n' |
    tr '[:upper:]' '[:lower:]' |
    awk 'length($0) > 2' |
    sort -u |
    tr '\n' ' '
)

# Print one index line for each file given as an argument.
# Arguments that are not readable or that are directories are reported on
# stderr and skipped; they do not change the exit status.
create_text_index() {
  for p in "$@"; do
    # Without this check an unreadable argument would still get an index
    # line, with an empty keyword list.
    if ! [ -r "$p" ] || [ -d "$p" ]; then
      printf '%s: cannot read file, skipping: %s\n' "${0##*/}" "$p" >&2
      continue
    fi

    # "--" keeps a file name starting with "-" from being parsed as an option.
    b=$(basename -- "$p")
    keywords=$(extract_keywords < "$p")
    printf '%s\t%s\n' "$b" "$keywords"
  done
}

create_text_index "$@"