#! /bin/sh
# ========================================
# 08 may 1996. change: small changes in help 
# ========================================
# bigrams - approximate mutual information for the bigrams of a text file.
#
# Usage: bigrams [-c n] filename
#   -c n : only report bigrams occurring at least n times (default 10)
#   -h   : print the help text (also printed when no argument is given)
#
# Needs the helper script `listwords` in the current directory. It is run as
# `sh listwords -r FILE`; its output is consumed here as one word token per
# line (assumption derived from how the output is used -- confirm against
# listwords itself).
#
# Side effects: (over)writes naam1, naam2, naam3 and naam4 in the current
# directory and leaves them behind, as described in the help text.

default_threshold=10

# Print the help text on stdout.
usage() {
   echo "Usage: bigrams [-c n ] filename";
   echo "computes the approx. mutual info for bigrams "
   echo "   file : textfile"
   echo "   -c n : n = bigram treshold (default = 10)"
   echo "Side-effects after program end (to be renamed if you want to keep them):"
   echo "   naam1, naam2 : all word tokens of file in original order"
   echo "   naam3        : all bigrams with frequencies"
   echo "   naam4        : all word types of file 1, sorted & frequencies"
   echo "Bigrams computes the approximate mutual info for bigrams"
   echo "and displays bigrams above a certain frequency treshold with"
   echo "their mutual info : log(f(x,y)/f(x)*f(y)) / log 2"
   echo "Copyright Hans Paijmans 1995"
}

# Score the bigrams listed in naam3 using the unigram counts in naam4.
# Arguments: $1 - minimum bigram frequency to report
# Outputs:   "score word1 word2" lines on stdout, highest score first
score_bigrams() {
   # The two files are passed as separate operands so that awk can tell them
   # apart by FNR == NR instead of by field count: naam3 ends with a dangling
   # "1 lastword" line (naam1 is one line longer than naam2), and treating
   # that line as a unigram would overwrite the real frequency of that word.
   #
   # NOTE(review): N is the number of word *types* (lines of naam4), exactly
   # as in the original script; the usual mutual-information formula uses the
   # number of tokens. Kept as is -- confirm the intent before changing it.
   awk -v min_freq="$1" '
      FNR == NR { freq[$2] = $1; ntypes = NR; next }
      NF == 3 && $1 >= min_freq && ($2 in freq) && ($3 in freq) {
         print log(ntypes * $1 / (freq[$2] * freq[$3])) / log(2), $2, $3
      }
   ' naam4 naam3 | sort -rn
}

# Entry point: parse the arguments, build the work files, print the scores.
# Globals:   default_threshold (read); threshold, input (written)
# Arguments: [-c n] filename   or   -h
# Returns:   0 on success, 1 if listwords fails, 2 on a usage error
main() {
   threshold=$default_threshold

   case ${1:--h} in
      -h)
         usage
         return 0
         ;;
      -c)
         if [ "$#" -lt 3 ]; then
            echo "bigrams: -c needs a threshold and a filename" >&2
            return 2
         fi
         threshold=$2
         input=$3
         ;;
      *)
         input=$1
         ;;
   esac

   case $threshold in
      '' | *[!0-9]*)
         echo "bigrams: threshold must be a non-negative integer: $threshold" >&2
         return 2
         ;;
   esac

   #----------------------------------------------------
   if ! sh listwords -r "$input" > naam1; then
      echo "bigrams: listwords failed on $input" >&2
      return 1
   fi

   # naam2 is naam1 shifted up by one line, so pasting both side by side
   # yields every pair of adjacent tokens. `tail +2` is obsolete syntax that
   # current tail implementations read as a file name; use -n +2.
   tail -n +2 naam1 > naam2
   paste -d " " naam1 naam2 | sort | uniq -c > naam3
   sort naam1 | uniq -c > naam4

   score_bigrams "$threshold"
}

main "$@"



