{ CS 310 - Data Structures - Big Program #1 - By Matt Martini }
Program Big_Prog_One(input,output);
 
CONST maxstrg = 20;        {maximum word length}
      minthreshold = 0.5;  {alternate 0.75}
 
TYPE  word = string[maxstrg];               {a word of text}
 
      ptr = ^wordptr;                       {record pointer}
      wordptr = RECORD
                  txt     : word;                {a word of text}
                  count   : INTEGER;             {frequency of word}
                  nextwrd : ptr                  {link to next word}
                END;
 
      alpharray = array['a'..'z'] of ptr;   {array of pointers to}
                                            {linked list of records}
 
VAR
     infile     : text;                     {input file}
     outfile    : text;                     {output file}
     filename   : string[20];               {name of file}
     stop_words : alpharray;                {common words}
     go_words   : alpharray;                {interesting words}
     aword      : word;                     {a word}
     thiswrd    : ptr;                      {scratch pointer}
     prev       : ptr;                      {scratch pointer before thiswrd}
     found      : boolean;                  {word search successful}
 
 
 
{Clear all global variables and arrarys to nil.}
Procedure INITALIZE;
VAR c  : char;
BEGIN
  FOR c := 'a' TO 'z' DO
      BEGIN
        go_words[c]    := NIL;
        stop_words[c]  := NIL;
      END;
  aword   := '';
  prev    := NIL;
  thiswrd := NIL;
  found   := FALSE;
END; {initalize}
 
 
{Return the first letter of word.}
Function  FIRST_LET(wrd:word):char;
BEGIN
  first_let := copy(wrd,1,1)
END; {first_let}
 
 
{Read next word from from infile.}
Procedure READWORD( var infile:text; var token:word);
VAR   ch     : char;
      neword : string[100];
      alpha  : SET OF char;
 
   {Turn an uppercase letter into lowercase.}
   Procedure TRIM_UPPER_CASE(var UPch:char);
   VAR big_alpha : SET OF char;
   BEGIN
     big_alpha := ['A'..'Z'];               {if capital - make it small}
     IF UPch IN big_alpha
        THEN UPch := chr(ord(UPch)-ord('A')+ord('a'))
   END; {trin_upper_case}
 
BEGIN
   alpha := ['a'..'z','A'..'Z'];
   neword := '';
   IF NOT EOF(infile)                  {don't try to read past end of file}
    THEN
      BEGIN
        READ(infile,ch);                    {read the first charachter}
        {if ch is a delimiter read charaters until a letter is found}
        WHILE (NOT EOF(INFILE)) AND (NOT(ch IN alpha)) DO
          READ(infile,ch);
        {if ch is a letter read the rest of the word}
        WHILE (NOT EOF(infile)) AND (ch IN alpha) DO
          BEGIN
            trim_upper_case(ch);
            neword := concat(neword,ch);
            READ(infile,ch)
          END;
        {get last letter of last word}
        IF (EOF(infile)) AND (ch IN alpha)
           THEN
             BEGIN
               trim_upper_case(ch);
               neword := concat(neword,ch)
             END;
        {truncate longwords to 20 chars.}
        IF length(neword) <= maxstrg
           THEN  token := neword
           ELSE  token := copy(neword,1,20);
      END;
END; {readword}
 
 
{Insert a record containing token into its alphabetic position}
{in the linked list pointed to by which.}
Procedure INSERT(var which:alpharray; token:word);
VAR  fl : char;
     pre, here, temp : ptr;  {scratch pointers}
BEGIN
  pre  := NIL;                    {initalize variables}
  here := NIL;
  temp := NIL;
  fl := first_let(token);         {get first letter of token}
  IF which[fl] = NIL              {no words in list with this first letter}
     THEN
       BEGIN
         NEW(which[fl]);               {make a record for this word}
         which[fl]^.txt     := token;
         which[fl]^.count   := 1;
         which[fl]^.nextwrd := NIL
       END
     ELSE                       {list of words already exists - put word in it}
       BEGIN
         here := which[fl];
         REPEAT                        {travel list to find correct spot}
           pre := here;
           here := here^.nextwrd;
         UNTIL (here = NIL) OR (here^.txt > token);
         IF pre^.txt = token                {is the word alreay in the list?}
            THEN pre^.count := pre^.count + 1    {yes - update count}
            ELSE                                 {no  - put it in the list}
              IF which[fl]^.txt > token
                 THEN                  {should this word be first on the list?}
                   BEGIN
                     NEW(temp);             {yes - put it first}
                     temp^.txt   := token;
                     temp^.count := 1;
                     temp^.nextwrd := which[fl];
                     which[fl] := temp
                   END
                 ELSE
                   BEGIN
                     NEW(temp);             {no - put it in the correct spot}
                     temp^.txt   := token;
                     temp^.count := 1;
                     temp^.nextwrd := here;
                     pre^.nextwrd := temp
                   END;
       END; {else}
END; {insert}
 
 
 
{Find a word in which and returns a pointer to the word and a pointer}
{to the word before it (nil if word is the first in the list) and    }
{a flag to see if it was found.}
Procedure FINDCELL(which:alpharray;token:word;
                   var prev,thiswrd:ptr;var found:boolean);
VAR  fl    : char;
     pre, here : ptr;
BEGIN
  pre   := NIL;  prev    := NIL;  {initalize variables}
  here  := NIL;  thiswrd := NIL;
  found := FALSE;
  fl := first_let(token);         {find the first letter of token}
  IF which[fl] <> NIL             {don't search if a list with this}
     THEN                               {first letter doesn't exist}
       BEGIN
         found := FALSE;
         here := which[fl];                 {set to first word in list}
         IF here^.txt = token               {is this the token?}
            THEN found := TRUE                   {yes}
            ELSE                                 {no - look for word}
              REPEAT                        {travel list to find this word}
                pre := here;
                here := here^.nextwrd;
                IF here^.txt = token THEN found := TRUE
              UNTIL (here = NIL) OR (found);
         IF found
            THEN                  {word is found in the list}
              BEGIN
                prev    := pre;        {set pointers to word and previous word}
                thiswrd := here
              END
            ELSE                  {word is not in list}
              BEGIN
                prev    := NIL;        {return nil}
                thiswrd := NIL
              END{found}
       END;
END; {findcell}
 
 
{Output all words and their frequencies in alphbetical order.}
Procedure WRITEWORDS(which:alpharray);
VAR  c       :char;
     whatwrd : ptr;
BEGIN
  whatwrd := NIL;
  FOR c := 'a' TO 'z' DO          {travel thru array of linked lists}
      BEGIN
        whatwrd := which[c];           {get root record}
        IF whatwrd <> NIL
           THEN
             REPEAT               {travel thru linked list outputing words}
               WRITELN(outfile,whatwrd^.count:3,'  --> ',whatwrd^.txt);
               whatwrd := whatwrd^.nextwrd
             UNTIL whatwrd = NIL;
      END;
END; {writewords}
 
 
{Conflate words with the same stem}
Procedure CONFLATE(which:alpharray; minthres: real);
VAR c : char;
    threshold  : real;
    wrd, prewrd : word;
    here, pre : ptr;
 
   {Return the threshold value for a pair of words.}
   Function con_thres(token,pred:word):real;
   VAR a, b, m, i : integer;
   BEGIN
     a := length(token);
     b := length(pred);
     i := 0;
     REPEAT                  {find the number of matching letters}
       i := i + 1;
     UNTIL (copy(token,1,i) <> copy(pred,1,i)) OR (i > a) OR (i > b);
     m := i - 1;
     con_thres := 2*m/(a+b);
   END; {con_thres}
 
   {Combine words occording to i. minimum length
                              ii. maximum frequency
                             iii. alphabetically.}
   Procedure combine_words;
   VAR thislen, prevlen : integer;
 
      {dispose of a record pointed to by thiswrd while maintaining list}
      Procedure del_thiswrd;
      BEGIN
        prev^.nextwrd := thiswrd^.nextwrd;
        prev^.count   := prev^.count + thiswrd^.count;
        here := prev;
        dispose(thiswrd)
      END;
 
      {dispose of the record before the one pointed}
      {to by thiswrd while maintaining list}
      Procedure del_prev;
      BEGIN
        thiswrd^.count := thiswrd^.count + prev^.count;
        {get a pointer to the record before prev}
        findcell(which,prev^.txt,prev,thiswrd,found);
        {get out your scorecard - the names of the players have changed}
        {thiswrd now points to the old prev, and prev points to the word}
        {before that, here is maintained at the old thiswrd}
        IF (found) AND (prev <> NIL)
           THEN prev^.nextwrd := here
           ELSE go_words[c] := here;
        dispose(thiswrd)
      END;
 
   BEGIN
     thislen := length(thiswrd^.txt);
     prevlen := length(prev^.txt);
     IF thislen > prevlen                   { i. select on minimum length}
        THEN del_thiswrd
        ELSE
          IF thislen < prevlen
             THEN del_prev
             ELSE
               BEGIN
                 IF thiswrd^.count < prev^.count { ii. select on max frequency}
                    THEN del_thiswrd
                    ELSE
                      IF thiswrd^.count > prev^.count
                         THEN del_prev
                         ELSE                    { iii. select alphabetically}
                           BEGIN
                             IF thiswrd^.txt > prev^.txt
                                THEN del_thiswrd
                                ELSE del_prev;
                           END;
               END;
   END; {combine_words}
 
BEGIN
  FOR c := 'a' TO 'z' DO          {travel thru array of lists}
      BEGIN
        here := which[c];         {if a list of this first letter exists}
        IF here <> NIL            {travel along list}
           THEN
             REPEAT
               thiswrd := NIL;
               prev    := NIL;
               findcell(which,here^.txt,prev,thiswrd,found); {get pointers}
               IF (found) AND (prev <> NIL)      {don't work on root word}
                  THEN
                    BEGIN
                      {if words meet criteria combine them}
                      threshold := con_thres(thiswrd^.txt,prev^.txt);
                      IF threshold > minthres THEN combine_words;
                    END;
               here := here^.nextwrd
             UNTIL here = NIL;
      END; {for}
END; {conflate}
 
 
 
BEGIN {Big_Prog_One}
  initalize;                           {initalize variables and arrays}
  filename := 'CON:';                  {open output file to console}
                                       {alternate output file OUT.TXT}
  assign(outfile,filename);
  rewrite(outfile);
  filename := 'STOPWORD.TXT';          {open file stopword.txt}
  assign(infile,filename);
  reset(infile);
  aword := '';
  readword(infile,aword);              {read stop-words into array stop_words}
  WHILE NOT EOF(infile) DO
    BEGIN
      insert(stop_words,aword);
      readword(infile,aword)
    END;
  close(infile);                       {close file stopwords.txt}
  filename := 'GOWORD.TXT';            {open file goword.txt}
  assign(infile,filename);
  reset(infile);
  aword := '';
  readword(infile,aword);              {read all words - if not in stop_words}
  WHILE NOT EOF(infile) DO             {put it into array go_words}
    BEGIN
      found := FALSE;
      findcell(stop_words,aword,prev,thiswrd,found);
      IF NOT found THEN insert(go_words,aword);
      readword(infile,aword)
    END;
  close(infile);                       {close file goword.txt}
  writeln(outfile,'CS 310-Data Structures-Big Program #1-By Matt Martini');
  writeln(outfile,'The interesting words:');
  writeln(outfile);
  writewords(go_words);                {type out the interesting words and}
                                       {their frequency counts}
  writeln(outfile);
  writeln(outfile,'The conflated list: (with a threshold of 0.5)');
  writeln(outfile);
  conflate(go_words,minthreshold);     {conflate list}
  writewords(go_words);                {output conflated list}
  close(outfile);                      {close output file}
END.


