/* $Id: lex.c,v 1.3 2007-11-23 23:14:47 kiesling Exp $ */

/*
  This file is part of ctalk.
  Copyright  2005-2007  Robert Kiesling, rkiesling@users.sourceforge.net.
  Permission is granted to copy this software provided that this copyright
  notice is included in all source code modules.

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software Foundation, 
  Inc., 51 Franklin St., Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include "ctpp.h"
#include "typeof.h"
#include "prtinfo.h"

/* TO DO -
   Use the international abbreviations in iso646.h.
*/


/*
 *  Variables defined in other modules that the lexer reads or updates.
 */
extern int error_line;           /* Declared in errorloc.c. */
extern int error_column;

extern RT_INFO rtinfo;           /* Declared in rtinfo.c.   */
extern char source_file[FILENAME_MAX];
extern int keepcomments_opt;

extern int line_info_line;       /* Declared in lineinfo.c. */

/* The type of input_size depends on the platform. */
#if defined(__CYGWIN32__) || defined(__DJGPP__)
extern int input_size;             /* Declared in lib/read.c.            */
#else
extern off_t input_size; 
#endif

extern int warnnestedcomments_opt;  /* Declared in rtinfo.c. */
extern int use_trigraphs_opt;
extern int warn_trigraphs_opt;
/* Repeats the declaration of source_file that appears above. */
extern char source_file[FILENAME_MAX];

/*
 *  Comment state maintained by lexical () and reset to zero at the
 *  start of tokenize ().
 */
int in_c_cmt, in_cplus_cmt;         /* Record if in a comment. */
int linesplice;                     /* Record line splices.    */

/*
 *  True if c is a '+' or '-' and d, the character that follows it,
 *  is a digit or a decimal point, so that c can be the sign of a
 *  numeric constant.  Each argument is parenthesized so that
 *  expression arguments expand correctly, and d is converted to
 *  unsigned char before it is passed to isdigit ().
 */
#define IS_SIGN(c,d) (((c) == '-' || (c) == '+') && \
		      (isdigit ((unsigned char)(d)) || (d) == '.'))

/*
 *  When set, lexical () does not treat a '#' as the start of a
 *  preprocessor directive.
 */
int rescanning = FALSE;     /* Set by preprocess.c when rescanning
				arguments. 
			     */

/*
 *  NOTE(review): within this file gnuc_attribute is only ever
 *  assigned FALSE; confirm where, if anywhere, it is meant to be set.
 */
static int gnuc_attribute;  /* Set by __attribute__ label, cleared after )). */

static int preprocess_line = FALSE;  /* Used to determine statement endings -
                                        Set when we encounter a PREPROCESS
					token and cleared at the NEWLINE 
                                        token.  If set, statements end
                                        at the NEWLINE.  Otherwise, statements
                                        end at the SEMICOLON.
				     */
/* Incremented and decremented by tokenize () at comment open and close. */
int comment = 0;                      /* Level of comment nesting. */


/*
 *  lexical () - Scan the next token from the input.
 *
 *  buf   The NUL-terminated input text.
 *  idx   In/out: the index in buf of the next unread character.
 *        On return it indexes the character that follows the token.
 *  m     The message that receives the token.  The token's text is
 *        stored in m -> name and its type in m -> tokentype.
 *
 *  Returns the token type stored in m -> tokentype, with two
 *  exceptions: at the terminating NUL the function sets the type
 *  to EOS and returns 0, and for a character that matches no other
 *  rule it sets the type to CHAR and returns the character itself.
 *
 *  The function also updates the file-scope comment, line splice,
 *  and preprocess line state, and, for line splices and line
 *  markers, the error location.
 */
int lexical (char *buf, long long *idx, MESSAGE *m) {

  int c, i, j;
  int numeric_type;
  char tmpbuf[MAXMSG];
  char *q;

  c = buf[(*idx)++];

  /* WHITESPACE */
  /* Newline characters have their own token type, below. */
  if (c == ' ' || c == '\t' || c == '\f' || c == '\v') {
    sprintf (m -> name, "%c", c);
    for (i = 1;
         ((buf[*idx] == ' ') || (buf[*idx] == '\t') || 
          (buf[*idx] == '\f'));
         i++, (*idx)++)
      m -> name[i] = buf[*idx];
    m -> name[i] = 0;
    m -> tokentype = WHITESPACE;
    return WHITESPACE;
  }

  /* OPENPAREN */
  if (c == '(') {

    /* TO DO -
     *  For now, simply replace the GNUC attributes with whitespace.
     */
    if (gnuc_attribute && buf[*idx] == '(') {
      int n_parens = 2;
      gnuc_attribute = FALSE;
      /*
       *  Skip to the matching close parentheses, but stop at the
       *  end of the input if the parentheses are not balanced.
       */
      for (j = *idx + 1; n_parens && buf[j]; j++) {
        if (buf[j] == '(')
          ++n_parens;
        if (buf[j] == ')')
          --n_parens;
      }
      *idx = j;
      strcpy (m -> name, " ");
      m -> tokentype = WHITESPACE;
      return WHITESPACE;
    }

    sprintf (m -> name, "%c", c);
    m -> tokentype = OPENPAREN;
    return OPENPAREN; 
  } 

  /* CLOSEPAREN */
  if (c == ')') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = CLOSEPAREN;
    return CLOSEPAREN; 
  } 

  /* OPENBLOCK */
  if (c == '{') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = OPENBLOCK;
    return OPENBLOCK; 
  } 

  /* CLOSEBLOCK */
  if (c == '}') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = CLOSEBLOCK;
    return CLOSEBLOCK; 
  } 

  /* OPENCCOMMENT */
  if (c == '/' && buf[*idx] == '*') {
     ++in_c_cmt;
     tmpbuf[0] = c;
     tmpbuf[1] = buf[(*idx)++];
     tmpbuf[2] = 0;
     sprintf (m -> name, "%s", tmpbuf);
     m -> tokentype = OPENCCOMMENT;
     return OPENCCOMMENT;
  }

  /* CLOSECCOMMENT */
  if (c == '*' && buf[*idx] == '/') {
    if (--in_c_cmt < 0) {
      _warning ("%s:%d: Unmatched comment close.\n",
                source_file, error_line);
      in_c_cmt = 0;
    }
    tmpbuf[0] = c;
    tmpbuf[1] = buf[(*idx)++];
    tmpbuf[2] = 0;
    sprintf (m -> name, "%s", tmpbuf);
    m -> tokentype = CLOSECCOMMENT;
    return CLOSECCOMMENT;
  }

  /* CPPCOMMENT */
  /* The token is the text from the first slash up to, but not
     including, the newline. */
  if (c == '/' && buf[*idx] == '/') {
    char *cmt_start = &buf[(*idx) - 1];
    size_t cmt_length, copy_length;

    ++in_cplus_cmt;
    q = index (&buf[*idx], '\n');
    /* A comment on the last line of the input might not end
       with a newline.  In that case the comment extends to the
       end of the input. */
    if (q == NULL)
      q = &buf[*idx] + strlen (&buf[*idx]);
    cmt_length = (size_t)(q - cmt_start);
    /* Do not write past the end of tmpbuf. */
    copy_length = (cmt_length < sizeof tmpbuf) ? 
      cmt_length : sizeof tmpbuf - 1;
    memcpy (tmpbuf, cmt_start, copy_length);
    tmpbuf[copy_length] = 0;
    strcpy (m -> name, tmpbuf);
    *idx += (q - &buf[*idx]);
    m -> tokentype = CPPCOMMENT;
    return CPPCOMMENT;
  }

  /* SEMICOLON */
  if (c == ';') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = SEMICOLON;
    return SEMICOLON;
  }

  /* ASTERISK, MULT_ASSIGN */
  if (c == '*') {
    switch (buf[*idx]) 
      {
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "*=");
        m -> tokentype = MULT_ASSIGN;
        return MULT_ASSIGN;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = ASTERISK;
        return ASTERISK;
        break;
      }
  }

  /* DIVIDE, DIV_ASSIGN */
  if (c == '/') {
    switch (buf[*idx]) 
      {
      case '=':
        sprintf (m -> name, "%s", "/=");
        ++(*idx);
        m -> tokentype = DIV_ASSIGN;
        return DIV_ASSIGN;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = DIVIDE;
        return DIVIDE;
        break;
      }
  }

  /* LT, ASL, ASL_ASSIGN, LE */
  if (c == '<') {
    switch (buf[*idx])
      {
      case '<':
        (*idx)++;
        switch (buf[*idx])
          {
          case '=':
            (*idx)++;
            sprintf (m -> name, "%s", "<<=");
            m -> tokentype = ASL_ASSIGN;
            return ASL_ASSIGN;
            break;
          default:
            sprintf (m -> name, "%s", "<<");
            m -> tokentype = ASL;
            return ASL;
            break;
          }
        break;
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "<=");
        m -> tokentype = LE;
        return LE;
        break;
      default:
        sprintf (m -> name, "%s", "<");
        m -> tokentype = LT;
        return LT;
        break;
      }
  }

  /* GT, ASR, ASR_ASSIGN, GE */
  if (c == '>') {
    switch (buf[*idx])
      {
      case '>':
        (*idx)++;
        switch (buf[*idx])
          {
          case '=':
            (*idx)++;
            sprintf (m -> name, "%s", ">>=");
            m -> tokentype = ASR_ASSIGN;
            return ASR_ASSIGN;
            break;
          default:
            sprintf (m -> name, "%s", ">>");
            m -> tokentype = ASR;
            return ASR;
            break;
          }
        break;
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", ">=");
        m -> tokentype = GE;
        return GE;
        break;
      default:
        sprintf (m -> name, "%s", ">");
        m -> tokentype = GT;
        return GT;
        break;
      }
  }

  /* BIT_COMP */
  if (c == '~') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = BIT_COMP;
    return BIT_COMP;
  }

  /* BIT_XOR, BIT_XOR_ASSIGN */
  if (c == '^') {
    switch (buf[*idx])
      {
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "^=");
        m -> tokentype = BIT_XOR_ASSIGN;
        return BIT_XOR_ASSIGN;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = BIT_XOR;
        return BIT_XOR;
        break;
      }
  }

  /* PERIOD, ELLIPSIS */
  if (c == '.') {
    if (buf[*idx] == '.' && buf[*idx+1] == '.') {
      (*idx) += 2;
      sprintf (m -> name, "%s", "...");
      m -> tokentype = ELLIPSIS;
      return ELLIPSIS;
    } else {
      sprintf (m -> name, "%c", c);
      m -> tokentype = PERIOD;
      return PERIOD;
    }
  }

  /* CONDITIONAL, or a trigraph if trigraphs are enabled. */
  if (c == '?') {
    if (use_trigraphs_opt && buf[*idx] == '?' &&
        unescape_trigraph (&buf[*idx-1])) {
      /*
      *  buf[*idx] is the second '?' in the trigraph.
      */
      sprintf (m -> name, "%c", unescape_trigraph (&buf[(*idx)-1]));
      m -> tokentype=trigraph_tokentype (unescape_trigraph (&buf[*idx-1]));
      if (warn_trigraphs_opt) {
      char s[5];
      substrcpy (s, &buf[*idx-1], 0, 3);
      _warning ("%s:%d: Warning: Trigraph sequence %s.\n", 
                source_file, error_line, s);
      }
      *idx += 2;
      return m -> tokentype;
    }
    sprintf (m -> name, "%c", c);
    m -> tokentype = CONDITIONAL;
    return CONDITIONAL;
  }

  /* COLON */
  if (c == ':') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = COLON;
    return COLON;
  }

  /* BAR, BOOLEAN_OR, BIT_OR_ASSIGN */
  if (c == '|') {
    switch (buf[*idx])
      {
      case '|':
        (*idx)++;
        sprintf (m -> name, "%s", "||");
        m -> tokentype = BOOLEAN_OR;
        return BOOLEAN_OR;
        break;
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "|=");
        m -> tokentype = BIT_OR_ASSIGN;
        return BIT_OR_ASSIGN;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = BAR;
        return BAR;
        break;
      }
  }

  /* AMPERSAND, BIT_AND, BIT_AND_ASSIGN, BOOLEAN_AND */
  if (c == '&') {
    int c_idx;
    switch (buf[*idx])
      {
      case '&':
        (*idx)++;
        sprintf (m -> name, "%s", "&&");
        m -> tokentype = BOOLEAN_AND;
        return BOOLEAN_AND;
        break;
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "&=");
        m -> tokentype = BIT_AND_ASSIGN;
        return BIT_AND_ASSIGN;
        break;
      default:
        sprintf (m -> name, "%c", c);
        /*
         *  If the preceding non-whitespace character is an operator,
         *  then the ampersand is an address-of operator, otherwise
         *  it is a bitwise and operator.
         */
        for (c_idx = (*idx) - 2; c_idx >= 0; c_idx--) {
          if (isspace ((int)buf[c_idx])) 
            continue;
          if (IS_C_OP_CHAR (buf[c_idx])) {
            m -> tokentype = AMPERSAND;
            return AMPERSAND;
          } else {
            m -> tokentype = BIT_AND;
            return BIT_AND;
          }
        }
        /*
         *  Only whitespace, or nothing at all, precedes the
         *  ampersand, so there is no left-hand operand and the
         *  ampersand must be an address-of operator.
         */
        m -> tokentype = AMPERSAND;
        return AMPERSAND;
      }
  }

  /* EQ, BOOLEAN_EQ */
  if (c == '=') {
    switch (buf[*idx])
      {
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "==");
        m -> tokentype = BOOLEAN_EQ;
        return BOOLEAN_EQ;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = EQ;
        return EQ;
        break;
      }
  }

  /* EXCLAM, INEQUALITY */
  if (c == '!') {
    switch (buf[*idx]) 
      {
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "!=");
        m -> tokentype = INEQUALITY;
        return INEQUALITY;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = EXCLAM;
        return EXCLAM;
        break;
      }
  }

  /* FLOAT, INTEGER, LONG, and LONGLONG                                  */
  /* TO DO - 
     Bring into conformance with inttypes.h, limits.h, and float.h. */
  /* The sign and suffixes, if any, are included in the token.           */
  /* A + or - is considered to be unary if it is preceded by another
     op.  See the NOEVAL_CHAR_SUFFIX define in plex.h, and 
     op_follows_noeval () below. */
  if (isdigit ((int)c) || 
      (c == '.' && isdigit((int)buf[*idx])) ||
      (c == '-' && isdigit((int)buf[*idx])) ||
      IS_SIGN (c, buf[*idx])) {

    /* A + or - that follows an operand is a binary operator, not
       the sign of the constant.  Return the operator that matches
       the character. */
    if ((c == '-' || c == '+') && !op_follows_noeval (buf, &buf[*idx-1])) {
      sprintf (m -> name, "%c", c);
      m -> tokentype = (c == '-') ? MINUS : PLUS;
      return m -> tokentype;
    }

    RADIX radix = decimal;

    /* Back up so that the scan starts at the first character of
       the constant. */
    (*idx)--;

    j = *idx;
    numeric_type = INTEGER_T;

    /* Account for a signed hexadecimal or octal constant. */
    if (!IS_SIGN (buf[j],buf[j+1])) {
      if ((buf[j] == (char)'0') && (buf[j + 1] != (char)'.')) {
        if (buf[j+1] == (char) 'x' || buf[j+1] == 'X') {
          radix = hexadecimal;
          j += 2;
        } else {
          if (buf[j+1] && (buf[j+1] >= '0' && buf[j+1] <= '7')) {
            radix = octal;
            j += 1;
          }
        }
      }
    } else {
      if ((buf[j+1] == (char)'0') && (buf[j + 2] != (char)'.')) {
        if (buf[j+2] == (char) 'x' || buf[j+2] == 'X') {
          radix = hexadecimal;
          j += 3;
          _warning ("Signed hexadecimal constant.");
        } else {
          if (buf[j+2] && (buf[j+2] >= '0' && buf[j+2] <= '7')) {
            radix = octal;
            j += 2;
            _warning ("Signed octal constant.");
          }
        }
      }
    }

    /*
     *  NOTE(review): IS_SIGN is also accepted in the middle of the
     *  constant below, so a + or - that is followed by a digit
     *  appears to be absorbed into the token (for example, the
     *  text 1+2).  Confirm that the callers expect this.
     */
    while (buf[j]) {

      if (buf[j] == (char)'.' && isdigit (buf[j+1]))
        numeric_type = DOUBLE_T;

      /* A single l or L suffix. */
      if ((numeric_type == INTEGER_T) && 
          ((buf[j] == (char)'l' || buf[j] == (char)'L') &&
           (buf[j+1] != (char)'l' && buf[j+1] != (char)'L')))
        numeric_type = LONG_T;

      /* A double l or L suffix. */
      if (((numeric_type == INTEGER_T) || (numeric_type == LONG_T)) && 
          ((buf[j] == (char)'l' || buf[j] == (char)'L') && 
           (buf[j+1] == (char)'l' || buf[j+1] == (char)'L')))
        numeric_type = LONGLONG_T;

      if ((numeric_type == DOUBLE_T) && 
          ((buf[j] == (char)'f') || (buf[j] == (char)'F')))
        numeric_type = FLOAT_T;

      if ((numeric_type == DOUBLE_T) && 
          ((buf[j] == (char)'l') || (buf[j] == (char)'L')))
        numeric_type = LONGDOUBLE_T;

      /* Decimal integers can also have 'u' and 'U' suffixes. */
      if ((radix == decimal) && 
          (numeric_type == INTEGER_T) && 
          ! (isdigit ((int)buf[j]) || IS_SIGN(buf[j],buf[j+1]) ||
             buf[j] == (char)'u' || buf[j] == (char)'U'))
        break;

      if ((radix == decimal) &&
          (numeric_type == LONG_T || numeric_type == LONGLONG_T) &&
          ! (isdigit ((int)buf[j]) || IS_SIGN(buf[j],buf[j+1]) ||
             buf[j] == (char)'l' || buf[j] == (char)'L' ||
             buf[j] == (char)'u' || buf[j] == (char)'U'))
        break;

      /* Octals need range checking. */
      if ((radix == octal) && isdigit(buf[j]) && 
          ((buf[j] < (char)'0') || (buf[j] > (char)'7')))
        _error ("Bad octal constant.");
      if ((radix == octal) && ((buf[j] < (char)'0') || (buf[j] > (char)'7')))
        break;

      if ((radix == hexadecimal) && 
          (!isxdigit ((int) buf[j]) && 
           (buf[j] != (char)'l') && (buf[j] != (char)'L')))
        break;

      if (((numeric_type == DOUBLE_T) ||
           (numeric_type == FLOAT_T) ||
           (numeric_type == LONGDOUBLE_T))
          &&
          (!isdigit ((int)buf[j]) &&
           (buf[j] != '.') &&
           (buf[j] != 'e') &&
           (buf[j] != 'E') &&
           !IS_SIGN (buf[j],buf[j+1])))
        break;
           
      j++;
    }
    substrcpy (m -> name, buf, *idx, j - *idx);
    *idx = j;
    switch (numeric_type)
      {
      case INTEGER_T:
        m -> tokentype = INTEGER;
        return INTEGER;
        break;
      case DOUBLE_T:
      case FLOAT_T:
      case LONGDOUBLE_T:
        m -> tokentype = DOUBLE;
        return DOUBLE;
        break;
      case LONG_T:
        m -> tokentype = LONG;
        return LONG;
        break;
      case LONGLONG_T:
        m -> tokentype = LONGLONG;
        return LONGLONG;
        break;
      }
  }

  /* PLUS, PLUS_ASSIGN, INCREMENT */
  /* Note that this occurs after the numeric constant tokens. */
  if (c == '+') {
    switch (buf[*idx])
      {
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "+=");
        m -> tokentype = PLUS_ASSIGN;
        return PLUS_ASSIGN;
        break;
      case '+':
        (*idx)++;
        sprintf (m -> name, "%s", "++");
        m -> tokentype = INCREMENT;
        return INCREMENT;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = PLUS;
        return PLUS;
        break;
      }
  }

  /* MINUS, MINUS_ASSIGN, DECREMENT, DEREF */
  /* Note that this occurs after the checking for numeric constants. */
  if (c == '-') {
    switch (buf[*idx])
      {
      case '=':
        (*idx)++;
        sprintf (m -> name, "%s", "-=");
        m -> tokentype = MINUS_ASSIGN;
        return MINUS_ASSIGN;
        break;
      case '-':
        (*idx)++;
        sprintf (m -> name, "%s", "--");
        m -> tokentype = DECREMENT;
        return DECREMENT;
        break;
      case '>':
        (*idx)++;
        sprintf (m -> name, "%s", "->");
        m -> tokentype = DEREF;
        return DEREF;
        break;
      default:
        sprintf (m -> name, "%c", c);
        m -> tokentype = MINUS;
        return MINUS;
        break;
      }
  }

  /* PREPROCESS, LITERALIZE, MACRO_CONCAT */
  if (c == '#') {
    /*
     *  We need to ensure that the token is the first *non-whitespace*
     *  token after a newline, or is the first non-whitespace 
     *  token in the text.
     *
     *  The calling functions in preprocess.c should set,
     *  "rescanning," to True when rescanning individual arguments
     *  that have '#' at or near the beginning of the text, so the
     *  code below knows that the '#' characters do not represent the
     *  beginning of preprocessor directives.
     */
    int lookback = ((*idx - 2) > 0) ? *idx - 2 : 0;

    while (((buf[lookback] == ' ') || 
           (buf[lookback] == '\t')) &&
           (lookback > 0)) {
      lookback--;
    }

    if ((buf[lookback] == '\n' || lookback == 0) && ! rescanning) {

      /* 
       *  If this is a line marker, parse it and set the 
       *  error location.
       */

      if (isdigit ((int)buf[*idx + 1]) && !line_info_line) {
        char *n, linebuf[MAXMSG];
        if ((n = index (&buf[*idx + 1], '\n')) != NULL) {
          substrcpy (linebuf, &buf[*idx-1], 0, n - &buf[*idx - 1]);
        } else {
          strcpy (linebuf, &buf[*idx - 1]);
        }
        line_info_tok (linebuf);
      }

      m -> name[0] = c;
      m -> name[1] = 0;
      m -> tokentype = PREPROCESS;
      preprocess_line = TRUE;
      return PREPROCESS;
    } else {
      if (buf[*idx] == '#') {
        (*idx)++;
        sprintf (m -> name, "%s", "##");
        m -> tokentype = MACRO_CONCAT;
        return MACRO_CONCAT;
      } else {
        sprintf (m -> name, "%s", "#");
        m -> tokentype = LITERALIZE;
        return LITERALIZE;
      }
    }
  }

  /* SIZEOF */
  if (c == 's' && !strncmp (&buf[*idx], "izeof", 5)) {
    (*idx) += 5;
    strcpy (m -> name, "sizeof");
    m -> tokentype = SIZEOF;
    return SIZEOF;
  }

  /* LABEL */
  if (isalnum ((int)c) || c == '_' || c == '$') {
    (*idx)--;

    q = tmpbuf;
    while (isalnum ((int)buf[*idx]) || (buf[*idx] == '_') 
           || (buf[*idx] == '$'))
      *q++ = buf[(*idx)++];
    *q = 0;

    strcpy (m -> name, tmpbuf);
    m -> tokentype = LABEL;
    return LABEL;
  }
    
  /* LITERAL */
  /* Check for quote characters ('"') and escaped quotes within
     the literal. */
  if (c == '"') {
    int j, k, in_str, esc_chr;
    int n_quotes;
    int input_length;
    int scan_c_cmt, scan_cplus_cmt;
    int start_str, end_str;
    int backoff;
    int str_length_org;
    char *splice_ptr;

    if (!in_c_cmt && !in_cplus_cmt) {

      input_length = strlen (&buf[*idx]);
      /*
       *  Easiest way to ensure that message can hold an
       *  entire string.  input_length + 2 is the maximum
       *  remaining input + opening quote + 1.
       */
      if (input_length > MAXLABEL) resize_message (m, input_length + 2);

      /*
       *  Esc_chr needs a default value that can't be mistaken
       *  for a <string index> - 1, so use -2.
       */

      for (j = *idx - 1, start_str = *idx - 1, in_str = FALSE, k = 0, 
             esc_chr = -2,n_quotes = 0, end_str = -1,
             scan_c_cmt = FALSE, scan_cplus_cmt = FALSE;
           (j - *idx <= input_length) && buf[j]; 
           j++) {
        switch (buf[j])
          {
          case '\"':
            if (esc_chr == (j - 1)) {
              esc_chr = -2;
              if (end_str == -1) {m -> name[k++] = buf[j];m -> name[k] = 0;}
            } else {
              if (!scan_c_cmt && !scan_cplus_cmt)
                ++n_quotes;
              if (in_str == TRUE) {
                in_str = FALSE;
                if (end_str == -1) {
                  m -> name[k++] = buf[j];
                  m -> name[k] = 0;
                }
                end_str = j;
              } else {
                in_str = TRUE;
                if (end_str == -1) { m -> name[k++] = buf[j];m -> name[k] = 0;}
              }
            }
            break;
          case '\\':
            if (esc_chr == (j - 1)) {
              esc_chr = -2;
              if (end_str == -1) { m -> name[k++] = buf[j]; m -> name[k] = 0; }
            } else {
              esc_chr = j;
              if (end_str == -1) { m -> name[k++] = buf[j]; m -> name[k] = 0; }
            }
            break;
          case '/':
            /*
             *  Note the start of a comment.
             */
            if (!in_str) {
              switch (buf[j+1])
                {
                case '*':
                  scan_c_cmt = TRUE;
                  break;
                case '/':
                  scan_cplus_cmt = TRUE;
                  break;
                }
            } else {
              if (end_str == -1) {m -> name[k++] = buf[j];m -> name[k] = 0;}
            }
            break;
          case '*':
            /*
             *  Note the end of a C comment.
             */
            if (!in_str) {
              if ((buf[j+1] == '/') && scan_c_cmt)
                scan_c_cmt = FALSE;
            } else {
              if (end_str == -1) {m -> name[k++] = buf[j]; m -> name[k] = 0; }
            }
            break;
          case '\n':
            /*
             *  Note the end of a C++ comment, or stop at the 
             *  end of a line that contains a preprocess 
             *  directive.
             */
            if (!in_str)
              if (scan_cplus_cmt) scan_cplus_cmt = FALSE;

            if (preprocess_line) {
              /*
               *  Line splice - handle below.
               */
              if (esc_chr != (j - 1)) {
                goto eoppl;
              } else {
                if (end_str == -1) { m -> name[k++] = buf[j]; m -> name[k] = 0;}
              }
            } else {
              if (end_str == -1) { 
                m -> name[k++] = buf[j]; 
                m -> name[k] = 0; 
              }
            }
            break;
          case '\r':
            /*
             *  The same as above, but handle MS-DOG line endings
             *  also.  NOTE - This is still largely untested.
             */
            if (!in_str)
              if (scan_cplus_cmt) scan_cplus_cmt = FALSE;
            if (preprocess_line) {
              /*
               *  Line splice - Handle below.
               */
              if (esc_chr != (j - 1)) {
                goto eoppl;
              } else {
                if (end_str == -1) { 
                  m -> name[k++] = buf[j++];
                  m -> name[k++] = buf[j];
                  m -> name[k] = 0;
                }
              }
            } else {
              if (end_str == -1) { 
                m -> name[k++] = buf[j++]; 
                m -> name[k++] = buf[j]; 
                m -> name[k] = 0; 
              }
            }
            break;
          default:
            if (end_str == -1) { m -> name[k++] = buf[j]; m -> name[k] = 0;}
            break;
          }
      }

      /*
       *  If the number of quotes to the end of the input is odd,
       *  then we have an unterminated string somehow.  
       *
       *  Because in this case we have incorrectly found the 
       *  string end at the quote that begins the next string,
       *  back off to:
       *
       *  1. The newline if we are scanning a preprocess line, 
       *     if it exists.  In this case, terminate the string
       *     at the newline.
       *  2. The character before the start of the next quote.
       *     There aren't any test cases yet to determine where
       *     to terminate the string correctly, so it is simply 
       *     all the characters until the next quote.  The next
       *     quote is, of course, included in the next string.
       */

      eoppl:

      /* 
       *   First, though, remove line splices, for
       *   both *NIX and MS-DOG;
       *
       *   Str_length_org is the length of the 
       *   string as it appears in the input 
       *   buffer.
       *
       *   The source and destination overlap, so the text
       *   that follows each splice is moved with memmove ().
       */

      str_length_org = strlen (m -> name);
      while ((splice_ptr = strstr (m -> name, "\\\n")) != NULL) {
        memmove (splice_ptr, splice_ptr + strlen ("\\\n"),
                 strlen (splice_ptr + strlen ("\\\n")) + 1);
        ++error_line;
      }
      while ((splice_ptr = strstr (m -> name, "\\\r\n")) != NULL) {
        memmove (splice_ptr, splice_ptr + strlen ("\\\r\n"),
                 strlen (splice_ptr + strlen ("\\\r\n")) + 1);
        ++error_line;
      }

      if (n_quotes % 2) {
        _warning ("%s:%d:warning: Unterminated string constant.\n",
                  source_file, error_line);
        if (preprocess_line) {
          if (index (m -> name, '\n')) {
            backoff = index (m -> name, '\0') - index (m -> name, '\n') + 1;
            m -> name[index (m -> name, '\n') - m -> name] = 0;
          } else { /* if (index (m -> name, '\n')) */
            backoff = 2;
            m -> name[k] = 0;
          }
        } else { /* if (preprocess_line) */
          backoff = 2;
          m -> name[k] = 0;
        }
      } else { /* if (n_quotes % 2) */
        backoff = 1;
        m -> name[k] = 0;
      }

      *idx += str_length_org - backoff;
      m -> tokentype = LITERAL;
      return LITERAL;
    } /*     if (!in_c_cmt && !in_cplus_cmt) */
  } /*   if (c == '"') */

  /* LITERAL_CHAR in single quotes.*/

  if (c == '\'') {
    if (!in_c_cmt && !in_cplus_cmt) {
      int i = 0;
      m -> name[i++] = c;
      /* Stop at the end of the input if the closing quote is
         missing. */
      while (buf[*idx]) {
        m -> name[i++] = buf[(*idx)++];
        /*
         *  If quoting an escaped single quote, break
         *  on the final single quote only.
         */
        if ((buf[(*idx) - 1] == '\'') && (buf[*idx] != '\''))
          break;
      }
      /* Terminate the token text before it is used as a string. */
      m -> name[i] = 0;
      if (!is_char_constant (m -> name))
        _warning ("%s:%d: Warning: Unknown character sequence %s.\n", 
                  source_file, error_line, m -> name);
      m -> tokentype = LITERAL_CHAR;
      return LITERAL_CHAR;
    }
  }

  /* CR - LF  to NEWLINE.
     A newline token can contain a run of newlines, *but it 
     should contain no other characters* between the newlines, 
     otherwise, line numbers will not be calculated correctly 
     in tokenize (), below. */ 
  if (c == '\r' || c == '\n') {
    int buflength = 0;

    /* Look at the character before the newline only if one
       exists. */
    if ((*idx >= 2) && (buf[*idx - 2] == '\\')) {
      strcpy (m -> name, " ");
      /* Increment the error line and reset error column also. */
      ++error_line; error_column = 0;
      m -> tokentype = WHITESPACE;
      return WHITESPACE;
    }

    strcpy (m -> name, "\n");

    while (((buf[*idx] == '\r') || (buf[*idx] == '\n')) &&
           (buflength < (MAXLABEL / 2))) {
      if (buf[*idx] == c) {
        strcat (m -> name, "\n");
        /* Don't increment the error line here - 
           set_error_location does that for all tokens that 
           contain newlines. */
      }
      (*idx)++; buflength++;
    }
    error_column = 0;
    if (preprocess_line)
      preprocess_line = FALSE;
    if (in_cplus_cmt)
      in_cplus_cmt = 0;
    m -> tokentype = NEWLINE;
    return NEWLINE;
  }

  /* ARGSEPARATOR */
  if (c == ',') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = ARGSEPARATOR;
    return ARGSEPARATOR;
  }

  /* ARRAYOPEN */
  if (c == '[') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = ARRAYOPEN;
    return ARRAYOPEN;
  }

  /* ARRAYCLOSE */
  if (c == ']') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = ARRAYCLOSE;
    return ARRAYCLOSE;
  }

  /* MODULUS */
  if (c == '%') {
    sprintf (m -> name, "%c", c);
    m -> tokentype = MODULUS;
    return MODULUS;
  }

  /* EOS - the end of the input. */
  if (c == 0) {
    m -> tokentype = EOS;
    *(m -> name) = 0;
    return 0;
  }

  if (c == '\\') {
    /* 
     *  Line splice - replace escape and newline with whitespace.  
     */
    if (!strncmp (&buf[*idx], "\r\n", 2) ||
        !strncmp (&buf[*idx], "\n\r", 2)) {
      strcpy (m -> name, " ");
      m -> tokentype = WHITESPACE;
      (*idx) += 2;
      ++linesplice;
      return WHITESPACE;
    } else {
      if ((buf[*idx] == '\n') || (buf[*idx] == '\r')) {
        strcpy (m -> name, " ");
        m -> tokentype = WHITESPACE;
        ++(*idx);
        ++linesplice;
        return WHITESPACE;
      }
    }
  }

  /* CHAR - any character that is not recognized above. */
  sprintf (m -> name, "%c", c);
  m -> tokentype = CHAR;
  return c;
}

/*
 *  tokenize () - Split a character string into tokens, and place a
 *  message for each token on a message stack by calling push ().
 *
 *  push  The function that pushes a message on the stack.  The
 *        value that it returns is treated as the stack pointer.
 *  buf   The NUL-terminated text to tokenize.
 *
 *  Returns the value returned by the last call to push (), or
 *  ERROR if no message was pushed.
 *
 *  If keepcomments_opt is set, the text of each C comment is 
 *  collected into a single message of type PREPROCESS_EVALED.  
 *  Otherwise, each comment is replaced by a single space.
 *
 *  A comment close without a matching comment open does not change
 *  the comment nesting level.  Otherwise the level would become
 *  negative and every token that follows would be treated as part
 *  of a comment.
 */

int tokenize (int (*push)(MESSAGE *), char *buf) {

  long long i = 0L, max;
  int tok;
  MESSAGE *m = NULL, 
    *m_cmt = NULL;
  int stack_ptr = ERROR;
  char tbuf[MAXLABEL];

  max = (long long) strlen (buf);

  i = in_c_cmt = in_cplus_cmt = linesplice = 0; 

  gnuc_attribute = FALSE;
  while (i < max) {

    if ((m = new_message()) == NULL)
      _error ("Tokenize: Invalid new_message.");

    tok = lexical (buf, &i, m);  /* get token */

    if (keepcomments_opt) {

      /*
       *  Resize the message name to the size of the input, and
       *  collect the tokens.
       */
      if (tok == OPENCCOMMENT) {
        if ((++comment > 1) && warnnestedcomments_opt) {
          _warning ("%s: %d: Warning: \'\\*\' within a comment.\n",
                    source_file, error_line);
        }
        if ((comment == 1) || (m_cmt == NULL)) {
          /*
           *  The message that collects the comment must be able
           *  to hold all of the text that is being tokenized.
           */
          long long cmt_size = (long long) input_size;
          if (cmt_size < max)
            cmt_size = max;
          m_cmt = m;
          stack_ptr = (push) (m_cmt);
          strcpy (tbuf, m_cmt -> name);
          free (m_cmt -> name);
          if ((m_cmt -> name = calloc ((size_t) cmt_size + 1, 
                                       sizeof (char))) == NULL)
            _error ("tokenize: %s.", strerror (errno));
          strcpy (m_cmt -> name, tbuf);
        } else {
          strcat (m_cmt -> name, m -> name);
          delete_message (m);
        }
        set_error_location (m_cmt, &error_line, &error_column);
      } else if ((tok == CLOSECCOMMENT) && (comment > 0) && 
                 (m_cmt != NULL)) {
        --comment;
        strcat (m_cmt -> name, m -> name);
        set_error_location (m, &error_line, &error_column);
        delete_message (m);
        if (!comment) {
          m_cmt -> tokentype = PREPROCESS_EVALED;
        }
      } else if ((comment > 0) && (m_cmt != NULL)) {
        /* Within a comment - add the token to the comment text. */
        strcat (m_cmt -> name, m -> name);
        set_error_location (m, &error_line, &error_column);
        delete_message (m);
      } else if (tok == CPPCOMMENT) {
        m -> tokentype = PREPROCESS_EVALED;
        stack_ptr = (push) (m);
      } else {
        /*
         *  An ordinary token.  This case also handles a comment
         *  close that has no matching comment open.
         */
        stack_ptr = (push) (m);
        set_error_location (m, &error_line, &error_column);
      }

    } else { /* if (keepcomments_opt) */

      if (tok == OPENCCOMMENT){
        if ((++comment > 1) && warnnestedcomments_opt) {
          _warning ("%s: %d: Warning: \'\\*\' within a comment.\n",
                    source_file, error_line);
        }
        strcpy (m -> name, " ");
        m -> tokentype = WHITESPACE;
        stack_ptr = (push) (m);
        set_error_location (m, &error_line, &error_column);
        continue;
      }

      if (tok == CLOSECCOMMENT) {
        delete_message (m);
        /* Do not let an unmatched comment close make the
           nesting level negative. */
        if (comment > 0)
          --comment;
        continue;
      }

      if (tok == CPPCOMMENT) {
        strcpy (m -> name, " ");
        m -> tokentype = WHITESPACE;
        stack_ptr = (push) (m);
        set_error_location (m, &error_line, &error_column);
        continue;
      }

      /* Discard the tokens within a comment, except for
         newlines. */
      if ((comment > 0) && m -> tokentype != NEWLINE) {
        delete_message (m);
        continue;
      }

      /*
       *  If lexical () encounters a line splice, 
       *  the function replaces it with white space.
       *  However, in order to keep the line numbers
       *  straight, we insert an equal number of newlines
       *  after the logical line - when lexical () 
       *  reaches the actual newline.
       *  
       *  We do this here so that we don't have to
       *  worry about inserting a line marker after a
       *  macro.
       */
      if (m -> tokentype == NEWLINE && linesplice) {
        int nl;
        for (nl = 1; nl <= linesplice; nl++)
          strcat (m -> name, "\n");
        linesplice = 0;
      }

      stack_ptr = (push) (m);
      set_error_location (m, &error_line, &error_column);
    } /* !keepcomments_opt */

  }

  error_reset ();

  return stack_ptr;
}

/*
 *   Return TRUE if the name of the message at msg_ptr + 1 on the 
 *   message stack begins with the character c.  Return FALSE if 
 *   that stack entry is empty or is not a valid message.
 */
int prefix (MESSAGE_STACK messages, int msg_ptr, int c) {

  MESSAGE *prev_msg = messages[msg_ptr+1];

  if (prev_msg && IS_MESSAGE (prev_msg) && 
      (prev_msg -> name[0] == (char) c))
    return TRUE;

  return FALSE;
}

/*
 *  set_error_location () - Record the current line and column in
 *  the message, then advance the line and column past the token.
 *
 *  m     The message for the token.
 *  line  In/out: the current line.
 *  col   In/out: the current column.
 *
 *  A NEWLINE token adds one line for each character of its name
 *  and sets the column to 1.  Any other token advances the column
 *  by the length of its name.  For a LITERAL that contains 
 *  newlines, each newline also adds one line, and the column is 
 *  counted from the last newline.
 *
 *  Always returns SUCCESS.
 */
int set_error_location (MESSAGE *m, int *line, int *col) {

  char *last_nl, *q;

  m -> error_column = *col;
  m -> error_line = *line;
  if (m -> tokentype == NEWLINE) {
    *col = 1;
    *line += strlen (m -> name);
  } else {
    *col += strlen (m -> name);
  }
  if (m -> tokentype == LITERAL) {
    last_nl = NULL;
    for (q = index (m -> name, '\n'); q != NULL; 
         q = index (q + 1, '\n')) {
      ++(*line);
      last_nl = q;
    }
    /*
     *  The column has already been advanced by the length of the
     *  literal, above.  Only recompute it if the literal spans 
     *  more than one line; otherwise the length would be counted
     *  twice.
     */
    if (last_nl != NULL)
      *col = strlen (last_nl);
  }
  return SUCCESS;
}

/*
 *  collect_tokens () - Collect messages into a buffer for 
 *  retokenization. 
 *
 *  Concatenates the names of the messages from messages[start]
 *  down to messages[end], in that order.
 *
 *  Returns a buffer allocated with calloc (), which the caller 
 *  must free.  If start is less than end, the buffer contains an
 *  empty string.  Calls _error () if the allocation fails, and 
 *  returns NULL if _error () returns.
 */

char *collect_tokens (MESSAGE_STACK messages, int start, int end) {

  int i;
  size_t buflength = 0,
    offset = 0,
    name_length;
  char *buf;

  for (i = start; i >= end; i--)
    buflength += strlen (messages[i] -> name);

  if ((buf = calloc (buflength + 1, sizeof (char))) == NULL) {
    _error ("collect_tokens: %s.", strerror (errno));
    return NULL;
  }

  /*
   *  Append each name at the current end of the text, so the 
   *  buffer does not need to be rescanned for each message.  
   *  calloc () has already provided the terminating NUL.
   */
  for (i = start; i >= end; i--) {
    name_length = strlen (messages[i] -> name);
    memcpy (buf + offset, messages[i] -> name, name_length);
    offset += name_length;
  }

  return buf;
}

/*
 *  Return the character that a trigraph sequence represents.
 *
 *  s points to the first character of a three-character sequence.
 *  Only the third character, s[2], is examined; the caller must
 *  check that the first two characters are question marks.
 *
 *  Returns 0 if s[2] does not complete a trigraph.
 */
char unescape_trigraph (char *s) {

  switch (s[2])
    {
    case '=':
      return '#';
    case '(':
      return '[';
    case '/':
      return '\\';
    case ')':
      return ']';
    case '\'':
      return '^';
    case '<':
      return '{';
    case '!':
      return '|';
    case '>':
      return '}';
    case '-':
      return '~';
    default:
      break;
    }
  return 0;
}

/*
 *  Return the token type for a character that was produced by
 *  unescape_trigraph (), or 0 if c is not one of those characters.
 */
int trigraph_tokentype (char c) {

  switch (c)
    {
    case '#':
      return PREPROCESS;
    case '[':
      return ARRAYOPEN;
    case '\\':
      return BACKSLASH;
    case ']':
      return ARRAYCLOSE;
    case '^':
      return BIT_XOR;
    case '{':
      return OPENBLOCK;
    case '|':
      return BAR;
    case '}':
      return CLOSEBLOCK;
    case '~':
      return BIT_COMP;
    default:
      break;
    }
  return 0;
}

/*
 *  Return True if a string contains a valid
 *  character constant or escape sequence.
 */
int is_char_constant (char *s) {

  RADIX r;
  char t[MAXMSG];

  strcpy (t, s);
  if (t[0] == '\'')
    TRIM_CHAR(t);

  switch (strlen (t))
    {
    case 0:
      return FALSE;
      break;
    case 1:
      if (t[0] == '\'' || t[0] == '\\' || t[0] == '\n')
	return FALSE;
      else
	return TRUE;
      break;
    case 2:
      /*
       *  Simple escape sequences.
       */
      if (t[0] == '\\') {
	if (t[1] == '\'' || t[1] == '\"' || t[1] == '\?' || 
	    t[1] == '\\' || t[1] == 'a' || t[1] == 'b' ||
	    t[1] == 'f' || t[1] == 'n' || t[1] == 'r' ||
	    t[1] == 't' || t[1] == 'v' || t[1] == '0')
	  return TRUE;
	else
	  return FALSE;
      } else {
	return FALSE;
      }
      break;
    default:
      /*
       *  Numeric escape sequences.
       */
      if (t[0] == '\\') {
	if ((r = radix_of (&t[1])) != ERROR) {
	  return TRUE;
	} else {
	  return FALSE;
	}
      }
      break;
    }

  return FALSE;
}

/*
 *  Used to check whether a + or - should be considered a sign.
 *  If the non-whitespace character preceding the + or -, pointed
 *  to by buf_idx, cannot end an operand, it is considered to be 
 *  an operator, and the + or - is considered to be a numeric sign
 *  for the following constant. See the NOEVAL_CHAR_SUFFIX #define 
 *  in plex.h.
 *
 *  buf_start  The start of the buffer that contains buf_idx.  The
 *             function does not read before this address.
 *  buf_idx    Points to the + or - character.
 *
 *  Returns TRUE if the + or - is a sign, FALSE otherwise.  If only
 *  whitespace, or nothing at all, precedes the + or -, there is no
 *  operand for it to follow, so the function returns TRUE.
 */
int op_follows_noeval (char *buf_start, char *buf_idx) {

  char *q = buf_idx;

  /* Check the bounds before each character is read. */
  while (q > buf_start) {
    --q;
    if (!isspace ((unsigned char)*q))
      return index (NOEVAL_CHAR_SUFFIX, *q) ? TRUE : FALSE;
  }

  return TRUE;
}
