#! /bin/sh

# This script counts how often each tag occurs (despite its name it does
# not compute averages), for every separate fragment and/or for the
# complete file.

# Input: a tagged file in which each tag precedes its token, joined by an
# underscore (tag_token). Words without an underscore are ignored.
# Fragments should be identified by 'rec <#>' on a line by itself.


#######################################
# Print the usage summary on stdout.
#######################################
usage() {
  cat <<'EOF'
Usage: tag-averages [-options] tagged_file
     A : print date & actual commandline, preceeded by #-sign
         will also print the recordnumber (if any) preceeded by #-sign
     m : to control the max length of the tag.
     t : to control output of individual fragments
     o : output as  '#-of-occs   tag'
         default is 'tag    #-of-occs'
EOF
}

#######################################
# Count tag occurrences per fragment and/or for the whole input.
# Arguments:
#   $1 - maximum number of leading characters of a tag that are kept
#   $2 - 1: print only the totals over the whole input
#        0: print the counts of every fragment
#   $3 - 1: print "count tag", 0: print "tag count"
#   $4 - optional input file; awk reads stdin when it is omitted
# Outputs: tab separated count lines on stdout
# Returns: the exit status of awk
#######################################
count_tags() {
  ct_max=$1
  ct_only_tots=$2
  ct_order=$3
  shift 3

  # The settings are passed with -v so that they are already defined in
  # BEGIN and still defined in END when the input is empty.
  awk -v max="$ct_max" -v only_tots="$ct_only_tots" -v order="$ct_order" '
  # Print the counts of the fragment that just ended (unless only totals
  # were requested), add them to the grand totals and reset them.
  # Counts are reset to 0 rather than deleted, so a tag seen in an earlier
  # fragment is reported with count 0 in every later fragment.
  function flush_fragment(   term) {
    for (term in total) {
      if (!only_tots) {
        if (order == 0)
          printf("%2s\t%4s\t%4d\n", fragment, term, total[term]);
        else
          printf("%2s\t%4d\t%4s\n", fragment, total[term], term);
      }
      grand_total[term] += total[term];
      total[term] = 0;
    }
  }

  BEGIN {
    fragment = 0;        # label used for text that precedes the first rec line
    only_fragments = 1;  # set to 0 to append "tot:" lines to the fragment output
  }

  # A "rec <#>" line closes the running fragment and names the next one.
  $1 == "rec" {
    flush_fragment();
    fragment = $2;
    next;
  }

  # Every word containing "_" is tag_token; only the first max characters
  # of the tag are used as the counting key.
  {
    for (i = 1; i <= NF; i++)
      if (index($i, "_")) {
        split($i, parts, "_");
        total[substr(parts[1], 1, max)]++;
      }
  }

  END {
    # The last fragment is not followed by a rec line, so flush it here.
    flush_fragment();

    if (only_tots || !only_fragments)
      for (term in grand_total) {
        if (!only_tots) printf("tot:\t");
        if (order == 0)
          printf("%4s\t%4d\n", term, grand_total[term]);
        else
          printf("%4d\t%4s\n", grand_total[term], term);
      }
  }
  ' "$@"
}

#######################################
# Parse the command line and run the tag count.
# Arguments: [-A] [-m length] [-t] [-o] tagged_file
# Returns:   2 on a usage error, otherwise the exit status of awk
#######################################
main() {
  if [ -z "${1:-}" ]; then
    usage
    return 2
  fi

  # setting defaults
  max_len=2
  totals_only=0
  count_first=0
  annotate=0
  max_given=0

  OPTIND=1
  while getopts 'Am:to' opt; do
    case "$opt" in
      A) annotate=1 ;;
      m) max_len=$OPTARG; max_given=1 ;;
      t) totals_only=1 ;;
      o) count_first=1 ;;
      *) usage >&2; return 2 ;;
    esac
  done
  shift $((OPTIND - 1))

  # A non-numeric length would silently truncate every tag to nothing.
  case "$max_len" in
    '' | *[!0-9]*)
      printf '%s\n' "tag-averages: -m needs a number, got: $max_len" >&2
      return 2
      ;;
  esac

  if [ "$annotate" = 1 ]; then
    # Options are echoed in the fixed order A, m, t, o.
    # NOTE(review): the usage text also promises the date and the record
    # numbers for -A; only the command line is printed.
    cmdline="# $0 -A"
    [ "$max_given" = 1 ] && cmdline="$cmdline -m$max_len"
    [ "$totals_only" = 1 ] && cmdline="$cmdline -t"
    [ "$count_first" = 1 ] && cmdline="$cmdline -o"
    [ $# -gt 0 ] && cmdline="$cmdline $1"
    printf '%s\n' "$cmdline"
  fi

  if [ $# -gt 0 ]; then
    count_tags "$max_len" "$totals_only" "$count_first" "$1"
  else
    count_tags "$max_len" "$totals_only" "$count_first"
  fi
}

# Must stay the last command: its status becomes the exit status of the script.
main "$@"












