Logo Search packages:      
Sourcecode: linux86 version File versions  Download package

tok_io.c

/*++
/* NAME
/*    tok_io 3
/* SUMMARY
/*    token I/O
/* PACKAGE
/*    unproto
/* SYNOPSIS
/*    #include "token.h"
/*
/*    struct token *tok_get()
/*
/*    void tok_flush(t)
/*    struct token *t;
/*
/*    void tok_show(t)
/*    struct token *t;
/*
/*    void tok_show_ch(t)
/*    struct token *t;
/*
/*    void put_str(s)
/*    char *s;
/*
/*    void put_ch(c)
/*    int c;
/*
/*    void put_nl()
/*
/*    char *in_path;
/*    int in_line;
/* DESCRIPTION
/*    These functions read from stdin and write to stdout. The
/*    tokenizer keeps track of where the token appeared in the input
/*    stream; on output, this information is used to preserve correct
/*    line number information (even after lots of token lookahead or
/*    after function-header rewriting) so that diagnostics from the
/*    next compiler stage make sense.
/*
/*    tok_get() reads the next token from standard input. It returns
/*    a null pointer when the end of input is reached.
/*
/*    tok_show() displays the contents of a (possibly composite) token
/*    on the standard output.
/*
/*    tok_show_ch() displays the contents of a single-character token
/*    on the standard output. The character should not be a newline.
/*
/*    tok_flush() displays the contents of a (possibly composite) token
/*    on the standard output and makes it available for re-use.
/*
/*    put_str() writes a null-terminated string to standard output.
/*    There should be no newline characters in the string argument.
/*
/*    put_ch() writes one character to standard output. The character
/*    should not be a newline.
/*
/*    put_nl() outputs a newline character and adjusts the program's idea of
/*    the current output line.
/*
/*    The in_path and in_line variables contain the file name and
/*    line number of the most recently read token.
/* BUGS
/*    The tokenizer is just good enough for the unproto filter.
/*    As a benefit, it is quite fast.
/* AUTHOR(S)
/*    Wietse Venema
/*    Eindhoven University of Technology
/*    Department of Mathematics and Computer Science
/*    Den Dolech 2, P.O. Box 513, 5600 MB Eindhoven, The Netherlands
/* LAST MODIFICATION
/*    92/01/15 21:52:59
/* VERSION/RELEASE
/*    1.3
/*--*/

static char io_sccsid[] = "@(#) tok_io.c 1.3 92/01/15 21:52:59";

/* C library */

#include <stdio.h>
#include <ctype.h>

extern char *strchr();
extern char *malloc();
extern char *realloc();
extern char *strcpy();

/* Application-specific stuff */

#include "token.h"
#include "vstring.h"
#include "error.h"

extern char *strsave();             /* XXX need include file */

/*
 * Stuff to keep track of original source file name and position.
 *
 * Path names are compared by pointer identity, not by content (see the
 * CHECK_LINE_CONTROL() macro and fix_line_control()), so in_path and
 * out_path both start out pointing at the same def_path object.
 */

static char def_path[] = "";        /* default path name */

char   *in_path = def_path;         /* current input file name */
int     in_line = 1;                /* current input line number */

static char *out_path = def_path;   /* last name in output line control */
static int out_line = 1;            /* current output line number */

/*
 * fix_line_control() compares last_ch with '\n' to decide whether the
 * output is in the middle of a line, and sets it to '\n' after printing a
 * line-control line. Presumably put_ch()/put_str() maintain it as well;
 * their definitions are not in this file -- confirm.
 */
int     last_ch;              /* type of last output */

/* Forward declarations */

static int read_quoted();
static void read_comment();
static int backslash_newline();
static char *read_hex();
static char *read_octal();
static void fix_line_control();

 /*
  * Character input with one level of pushback. The INPUT() macro recursively
  * strips backslash-newline pairs from the input stream. The UNPUT() macro
  * should be used only for characters obtained through the INPUT() macro.
  * 
  * After skipping a backslash-newline pair, the input line counter is not
  * updated, and we continue with the same logical source line. We just
  * update a counter with the number of backslash-newline sequences that must
  * be accounted for (backslash_newline() updates the counter). At the end of
  * the logical source line, an appropriate number of newline characters is
  * pushed back (in tok_get()). I do not know how GCC handles this, but it
  * seems to produce the same output.
  * 
  * Because backslash_newline() recursively calls itself (through the INPUT()
  * macro), we will run out of stack space, given a sufficiently long
  * sequence of backslash-newline pairs.
  */

/*
 * Pushback state shared by INPUT() and UNPUT(). in_char is a plain char, so
 * an int value such as EOF is narrowed when UNPUT() stores it; callers in
 * this file guard most UNPUT() calls with an explicit "!= EOF" test.
 */
static char in_char = 0;            /* push-back storage */
static int in_flag = 0;             /* pushback available */
static int nl_compensate = 0;       /* line continuation kluge */

/*
 * INPUT(c) assigns the next input character to the lvalue c and yields it:
 * the pushed-back character if there is one; otherwise the next getchar()
 * result, except that a backslash immediately followed by a newline is
 * skipped via backslash_newline(). A backslash followed by anything else is
 * returned as-is and the following character is handed back with ungetc().
 * The argument is evaluated several times, so it must be a side-effect-free
 * lvalue.
 *
 * UNPUT(c) records c as the single pushed-back character; a second UNPUT()
 * before the next INPUT() overwrites the first.
 */
#define INPUT(c) (in_flag ? (in_flag = 0, c = in_char) : \
                (c = getchar()) != '\\' ? c : \
                (c = getchar()) != '\n' ? (ungetc(c, stdin), c = '\\') : \
                (c = backslash_newline()))
#define     UNPUT(c) (in_flag = 1, in_char = c)

/*
 * Directives that should be ignored.
 *
 * The table only exists when IGNORE_DIRECTIVES is defined. It is terminated
 * by a null pointer; do_control() walks it and compares each entry with the
 * word that follows '#'. IGNORE_DIRECTIVES is presumably a comma-separated
 * list of string literals supplied at build time -- confirm against the
 * Makefile.
 */

#ifdef IGNORE_DIRECTIVES

static char *ignore_directives[] = {
    IGNORE_DIRECTIVES,
    0,
};

#endif

/*
 * Modified string and ctype stuff.
 *
 * STREQUAL() compares the first characters before paying for a strcmp()
 * call. The character-class macros extend <ctype.h> for the tokenizer:
 * '_' counts as a letter, newline does not count as white space.
 *
 * Every macro argument is parenthesized so that an expression argument
 * cannot regroup under operator precedence. ISHEX() uses isxdigit(): the
 * former strchr("abcdefABCDEF", c) test also matched c == 0, because
 * strchr() finds the string terminator when asked to look for NUL.
 */

#define     STREQUAL(x,y)     (*(x) == *(y) && strcmp((x),(y)) == 0)

#define     ISALNUM(c)  (isalnum(c) || (c) == '_')
#define     ISALPHA(c)  (isalpha(c) || (c) == '_')
#define     ISSPACE(c)  (isspace(c) && (c) != '\n')
#define     ISDOT(c)    ((c) == '.')
#define     ISHEX(c)    (isxdigit(c) != 0)
#define     ISOCTAL(c)  (isdigit(c) && (c) != '8' && (c) != '9')

/*
 * Collect all characters that satisfy one condition.
 *
 * COLLECT(v, c, cond) stores the character already held in c at the start
 * of v's buffer, then keeps reading with INPUT(c) and appending with
 * VS_ADDCH() for as long as cond holds. The first character that fails cond
 * is pushed back with UNPUT(); EOF just ends the loop. The result is
 * null-terminated.
 *
 * Notes for callers: c must be an lvalue and cond is re-evaluated after each
 * INPUT(c), so it should be written in terms of c. The macro declares its
 * own locals named vs and cp, so the v argument must not itself be an
 * expression that mentions identifiers with those names. The first
 * character is stored without a capacity check, i.e. the buffer is assumed
 * to have room for at least one character.
 */

#define     COLLECT(v,c,cond) { \
                        register struct vstring *vs = v; \
                        register char *cp = vs->str; \
                        *cp++ = c; \
                        while (INPUT(c) != EOF) { \
                            if (cond) { \
                              if (VS_ADDCH(vs, cp, c) == 0) \
                                  fatal("out of memory"); \
                            } else { \
                              UNPUT(c); \
                              break; \
                            } \
                        } \
                        *cp = 0; \
                      }

/*
 * Ensure that output line information is correct.
 *
 * Calls fix_line_control(p, l) only when the output position differs from
 * the requested one. The path test is a pointer comparison, not a string
 * comparison. The macro expands to a braced block, so "MACRO(...);" leaves
 * an empty statement behind; do not use it as the unbraced body of an "if"
 * that has an "else".
 */

#define     CHECK_LINE_CONTROL(p,l) { if (out_path != (p) || out_line != (l)) \
                              fix_line_control((p),(l)); }

/*
 * do_control - parse control line
 *
 * Called by tok_get() after it has consumed a '#' at the start of a line.
 * Reads tokens with tok_get() and handles three cases:
 *
 * - "# number [pathname] junk": line control. Everything up to and including
 *   the newline is copied to the output, and the input and output position
 *   (in_line/out_line, in_path/out_path) are set from the directive. When no
 *   quoted path name follows the number, or the input ends first, the
 *   current in_path is kept.
 *
 * - a directive listed in ignore_directives (only when IGNORE_DIRECTIVES is
 *   defined): the rest of the line is discarded.
 *
 * - anything else: "#" and the rest of the line are copied to the output.
 *
 * Returns 0 in all cases. The value carries no information; tok_get()
 * ignores it. It is returned explicitly because an int function must not
 * use a bare "return;".
 *
 * A token's type is saved before the token is handed to tok_flush() or
 * tok_free(), so the loops never look inside a token that was released.
 */

static int do_control()
{
    struct token *t;
    int     tokno;
    int     line;
    char   *path;

    /* Make sure that the directive shows up in the right place. */

    CHECK_LINE_CONTROL(in_path, in_line);

    while ((t = tok_get()) != 0) {
        switch (t->tokno) {

        case TOK_WSPACE:
            /* Ignore blanks after "#" token. */
            tok_free(t);
            break;

        case TOK_NUMBER:

            /*
             * Line control is of the form: number pathname junk. Since we
             * have no idea what junk the preprocessor may generate, we copy
             * all line control tokens to stdout.
             */

            put_str("# ");
            line = atoi(t->vstr->str);          /* extract line number */
            tok_flush(t);
            while ((t = tok_get()) != 0 && t->tokno == TOK_WSPACE)
                tok_flush(t);                   /* copy white space */

            /*
             * Default to the current name, so that path is defined even when
             * the input ends right after the line number.
             */
            path = in_path;
            if (t) {
                if (t->tokno == '"')            /* extract path name */
                    path = strsave(t->vstr->str);
                do {
                    tokno = t->tokno;           /* save before release */
                    tok_flush(t);               /* copy until newline */
                } while (tokno != '\n' && (t = tok_get()) != 0);
            }
            out_line = in_line = line;          /* synchronize */
            out_path = in_path = path;          /* synchronize */
            return (0);

#ifdef IGNORE_DIRECTIVES

        case TOK_WORD:

            /*
             * Optionally ignore other #directives. This is only a partial
             * solution, because the preprocessor will still see them.
             */
            {
                char  **cpp;
                char   *cp = t->vstr->str;

                for (cpp = ignore_directives; *cpp; cpp++) {
                    if (STREQUAL(cp, *cpp)) {
                        do {
                            tokno = t->tokno;   /* save before release */
                            tok_free(t);
                        } while (tokno != '\n' && (t = tok_get()) != 0);
                        return (0);
                    }
                }
            }
            /* FALLTHROUGH */
#endif
        default:
            /* Pass through. */
            put_ch('#');
            do {
                tokno = t->tokno;               /* save before release */
                tok_flush(t);
            } while (tokno != '\n' && (t = tok_get()) != 0);
            return (0);

        case 0:

            /*
             * Token type 0. With the tok_get() in this file that is a
             * single NUL character (end of input is reported as a null
             * pointer instead). Emit the '#', drop the token, and give up on
             * this line.
             */
            put_ch('#');
            tok_free(t);
            return (0);
        }
    }
    return (0);                                 /* end of input */
}

/*
 * backslash_newline - fix up things after reading a backslash-newline pair
 *
 * Called only from the last arm of the INPUT() macro, i.e. right after a
 * backslash-newline pair was consumed and while no pushed-back character is
 * pending. Counts the skipped pair in nl_compensate and returns the next
 * input character (possibly EOF), skipping and counting any further
 * backslash-newline pairs that follow immediately.
 *
 * This reads the input the same way INPUT() does when nothing is pushed
 * back, but with a loop instead of calling itself through the macro. A long
 * run of backslash-newline pairs therefore no longer consumes one stack
 * frame per pair.
 */

static int backslash_newline()
{
    register int c;

    for (;;) {
        nl_compensate++;                  /* one more pair to make up for */
        if ((c = getchar()) != '\\')
            return (c);                   /* ordinary character or EOF */
        if ((c = getchar()) != '\n') {
            ungetc(c, stdin);             /* lone backslash: hand back what followed */
            return ('\\');
        }
        /* Another backslash-newline pair: count it and keep going. */
    }
}

/*
 * tok_get - get next token
 *
 * Reads one token from standard input through the INPUT() macro and returns
 * it, or returns a null pointer (after releasing the unused token) when the
 * end of input is reached.
 *
 * Each returned token carries the input path and line that were current when
 * its first character was read (t->path, t->line) and the line that was
 * current when it was complete (t->end_line). Token types assigned here:
 *
 *   TOK_WSPACE  run of white space other than newline, or a whole comment
 *   TOK_WORD    letter or '_' followed by letters, digits and '_'
 *   TOK_NUMBER  run of decimal digits
 *   '"' / '\''  properly terminated string or character literal
 *   TOK_OTHER   non-ASCII character, run of dots, backslash plus the next
 *               character, or a literal without closing quote
 *   c           any other single character, including '\n'
 *
 * A '#' seen while last_tokno is '\n' is handed to do_control() and does not
 * produce a token itself; the loop then continues with the same token
 * structure.
 */

/* Type of the most recently returned token; '\n' means "at start of line". */
static int last_tokno = '\n';

struct token *tok_get()
{
    register struct token *t;
    register int c;
    int     d;

    /*
     * Get one from the pool and fill it in. The loop is here in case we hit
     * a preprocessor control line, which happens in a minority of all cases.
     * We update the token input path and line info *after* backslash-newline
     * processing or the newline compensation would go wrong.
     */

    t = tok_alloc();

    for (;;) {
      if ((INPUT(c)) == EOF) {
          tok_free(t);
          return (0);
      /* The comma expression records the position as a side effect, then tests. */
      } else if ((t->line = in_line, t->path = in_path), !isascii(c)) {
          t->vstr->str[0] = c;
          t->vstr->str[1] = 0;
          t->tokno = TOK_OTHER;
          break;
      } else if (ISSPACE(c)) {
          COLLECT(t->vstr, c, ISSPACE(c));
          t->tokno = TOK_WSPACE;
          break;
      } else if (ISALPHA(c)) {
          COLLECT(t->vstr, c, ISALNUM(c));
          t->tokno = TOK_WORD;
          break;
      } else if (isdigit(c)) {
          COLLECT(t->vstr, c, isdigit(c));
          t->tokno = TOK_NUMBER;
          break;
      } else if (c == '"' || c == '\'') {
          t->tokno = read_quoted(t->vstr, c);   /* detect missing end quote */
          break;
      } else if (ISDOT(c)) {
          COLLECT(t->vstr, c, ISDOT(c));
          t->tokno = TOK_OTHER;
          break;
      } else if (c == '#' && last_tokno == '\n') {
          /* Control line: handled out of band, then try again with the same t. */
          do_control();
          continue;
      } else {
          t->vstr->str[0] = c;
          if (c == '\n') {
            in_line++;
            /*
             * Each stripped backslash-newline pair is paid back here with one
             * extra newline, which is delivered by the next call.
             */
            if (nl_compensate > 0) {      /* compensation for bs-nl */
                UNPUT('\n');
                nl_compensate--;
            }
          } else if (c == '/') {
            if ((INPUT(d)) == '*') {
                t->vstr->str[1] = d;      /* comment */
                read_comment(t->vstr);
                t->tokno = TOK_WSPACE;
                break;
            } else {
                /* Not a comment: hand the look-ahead back, unless it was EOF. */
                if (d != EOF)
                  UNPUT(d);
            }
          } else if (c == '\\') {
            /* Backslash and the character after it form one two-character token. */
            t->vstr->str[1] = (INPUT(c) == EOF ? 0 : c);
            t->vstr->str[2] = 0;
            t->tokno = TOK_OTHER;
            break;
          }
          /* Single-character token; its type is the character itself. */
          t->vstr->str[1] = 0;
          t->tokno = c;
          break;
      }
    }
    last_tokno = t->tokno;
    t->end_line = in_line;
    return (t);
}

/*
 * read_quoted - read string or character literal, canonicalize escapes
 *
 * vs receives the literal, starting with the opening quote ch that the
 * caller has already consumed. Returns ch when the matching closing quote
 * was found. Returns TOK_OTHER when a newline or the end of input came
 * first, so that a broken literal is never mistaken for a string constant
 * that can be concatenated with its neighbours. A newline that ends the
 * literal early is pushed back, not stored.
 *
 * Escape handling (the backslash itself has already been stored):
 *   \a      replaced by BELL, or by "\007" when BELL is not defined
 *   \x...   rewritten by read_hex()
 *   \octal  rewritten by read_octal(), but only inside string literals
 *   other   the escaped character is copied unchanged
 */

static int read_quoted(vs, ch)
register struct vstring *vs;
int     ch;
{
    register char *wp = vs->str;
    register int cur;
    int     status = TOK_OTHER;           /* pessimistic until quote closes */

    *wp++ = ch;

    for (;;) {
        if (INPUT(cur) == EOF)            /* EOF inside literal */
            break;
        if (cur == '\n') {                /* newline in string */
            UNPUT(cur);
            break;
        }
        if (VS_ADDCH(vs, wp, cur) == 0)   /* store character */
            fatal("out of memory");
        if (cur == ch) {                  /* closing quote */
            status = cur;
            break;
        }
        if (cur != '\\')                  /* plain character, next one */
            continue;

        /* Escape sequence: look at the character after the backslash. */

        if ((INPUT(cur)) == EOF)          /* EOF, punt */
            break;
        if (cur == 'a') {                 /* \a -> audible bell */

            /*
             * NOTE(review): without BELL this appends the one-character
             * string "\007" (a raw BEL byte) after the stored backslash,
             * not the three digits 007 -- confirm that this is intended.
             */
#ifdef BELL
            if ((wp = vs_strcpy(vs, wp, BELL)) == 0)
#else
            if ((wp = vs_strcpy(vs, wp, "\007")) == 0)
#endif
                fatal("out of memory");
        } else if (cur == 'x') {          /* \xhh -> \nnn */
            wp = read_hex(vs, wp);
        } else if (ISOCTAL(cur) && ch != '\'') {
            wp = read_octal(vs, wp, cur); /* canonicalize \octal */
        } else {
            if (VS_ADDCH(vs, wp, cur) == 0)     /* \other: leave alone */
                fatal("out of memory");
        }
    }
    *wp = 0;
    return (status);
}

/*
 * read_comment - stuff a whole comment into one huge token
 *
 * On entry the first two bytes of vs already hold the opening slash-star
 * that the caller consumed. Characters are appended up to and including the
 * closing star-slash, or until the end of input. in_line is advanced for
 * every newline seen directly. A character that follows a backslash is
 * copied without being examined, so it can neither close the comment nor
 * bump the line counter.
 */

static void read_comment(vs)
register struct vstring *vs;
{
    register char *wp = vs->str + 2;      /* skip slash star */
    register int cur;
    register int peek;

    for (;;) {
        if (INPUT(cur) == EOF)            /* unterminated comment */
            break;
        if (VS_ADDCH(vs, wp, cur) == 0)
            fatal("out of memory");

        if (cur == '\n') {                /* comment spans lines */
            in_line++;
            continue;
        }
        if (cur == '\\') {                /* copy escaped character blindly */
            if ((INPUT(peek)) != EOF && VS_ADDCH(vs, wp, peek) == 0)
                fatal("out of memory");
            continue;
        }
        if (cur != '*')                   /* ordinary comment text */
            continue;

        /* A star: the comment ends here if a slash follows. */

        if ((INPUT(peek)) == '/') {
            if (VS_ADDCH(vs, wp, peek) == 0)
                fatal("out of memory");
            break;
        }
        if (peek != EOF)                  /* not the end, look at it again */
            UNPUT(peek);
    }
    *wp = 0;
}

/*
 * read_hex - rewrite hex escape to three-digit octal escape
 *
 * Called by read_quoted() after it has stored the backslash and consumed
 * the 'x'. Reads the hexadecimal digits that follow and appends their value
 * to vs at cp as exactly three octal digits, so that adjacent string
 * constants can be concatenated safely. Returns the updated write position.
 *
 * Diagnostics: when no hex digit follows, an error is reported and a
 * literal 'x' is stored; when the value needs more than three octal digits,
 * an error is reported and only the last three digits are kept.
 *
 * The digit loop stops one short of the buffer size so that the terminating
 * null byte always fits. The character that ended the digit run is pushed
 * back unless it was EOF, because the pushback slot is a char and cannot
 * represent EOF.
 */

static char *read_hex(vs, cp)
struct vstring *vs;
register char *cp;
{
    register int c = EOF;
    register int i;
    char    buf[BUFSIZ];
    int     len;
    unsigned val = 0;                     /* defined even if sscanf() fails */

    /*
     * Eat up all subsequent hex digits. Complain later when there are too
     * many.
     */

    for (i = 0; i < (int) sizeof(buf) - 1; i++) {
        if (INPUT(c) == EOF || !ISHEX(c))
            break;
        buf[i] = c;
    }
    buf[i] = 0;

    /*
     * i is below the limit only when the loop ended on a character that was
     * read but not stored; that character belongs to whoever reads next.
     */

    if (i < (int) sizeof(buf) - 1 && c != EOF)
        UNPUT(c);

    /*
     * Convert hex form to three-digit octal form. The three-digit form is
     * used so that strings can be concatenated without problems. Complain
     * about malformed input; truncate the result to at most three octal
     * digits.
     */

    if (i == 0) {
        error("\\x escape sequence without hexadecimal digits");
        if (VS_ADDCH(vs, cp, 'x') == 0)
            fatal("out of memory");
    } else {
        (void) sscanf(buf, "%x", &val);
        sprintf(buf, "%03o", val);
        if ((len = strlen(buf)) > 3)
            error("\\x escape sequence yields non-character value");
        if ((cp = vs_strcpy(vs, cp, buf + len - 3)) == 0)
            fatal("out of memory");
    }
    return (cp);
}

/*
 * read_octal - convert octal escape to three-digit format
 *
 * Called by read_quoted() after it has stored the backslash; c is the first
 * octal digit, already consumed. Reads up to two more octal digits and
 * appends the escape to vs at cp as exactly three digits, left-padded with
 * '0'. Returns the updated write position.
 *
 * obuf holds two '0' pad characters followed by room for three digits and
 * the terminator. The digits are written at obuf + 2, and the copy starts
 * at (obuf + 2) + i - 3, which takes in 3 - i pad characters.
 *
 * The character that ended the digit run is pushed back unless it was EOF,
 * because the pushback slot is a char and cannot represent EOF. The test
 * used to be "c" (non-zero), which let EOF through.
 */

static char obuf[] = "00123";

static char *read_octal(vs, cp, c)
register struct vstring *vs;
register char *cp;
register int c;
{
    register int i;

#define     buf_input (obuf + 2)

    /* Eat up at most three octal digits. */

    buf_input[0] = c;
    for (i = 1; i < 3 && (INPUT(c) != EOF) && ISOCTAL(c); i++)
        buf_input[i] = c;
    buf_input[i] = 0;

    /* With fewer than three digits, c holds a character that was not used. */

    if (i < 3 && c != EOF)
        UNPUT(c);

    /*
     * Leave three-digit octal escapes alone. Convert one-digit and two-digit
     * octal escapes to three-digit form by prefixing them with a suitable
     * number of '0' characters. This is done so that strings can be
     * concatenated without problems.
     */

    if ((cp = vs_strcpy(vs, cp, buf_input + i - 3)) == 0)
        fatal("out of memory");
    return (cp);
}

/*
 * put_nl - emit newline and adjust output line count
 *
 * Every newline written to the output is meant to go through here, so that
 * out_line keeps matching the line the output is actually on.
 */

void    put_nl()
{
    put_ch('\n');
    out_line += 1;
}

/*
 * fix_line_control - to adjust path and/or line count info in output
 *
 * Brings the output position to (path, line). An unfinished output line is
 * terminated first. A forward jump within the same file is made up with
 * newlines. A switch to another file (different path pointer) or a jump
 * backwards is announced with a "# line path" control line.
 *
 * The function is called sporadically, so it does not matter that it
 * repeats part of the test the caller already made.
 */

static void fix_line_control(path, line)
register char *path;
register int line;
{
    if (last_ch != '\n')                  /* terminate open line */
        put_nl();

    if (path == out_path && line >= out_line) {
        while (out_line < line)           /* forward jump: pad with newlines */
            put_nl();
        return;
    }

    /* File switch or back jump: emit explicit line control. */

    out_line = line;
    out_path = path;
    printf("# %d %s\n", line, path);
    last_ch = '\n';
}

/*
 * tok_show_ch - output single-character token (not newline)
 *
 * The output position is first synchronized with the token's path and
 * line. The character written is the token type itself, not the token's
 * string buffer.
 */

void    tok_show_ch(t)
register struct token *t;
{
    int     ch;

    CHECK_LINE_CONTROL(t->path, t->line);

    ch = t->tokno;                        /* token type doubles as content */
    put_ch(ch);
}

/*
 * tok_show - output (possibly composite) token
 *
 * A TOK_LIST token is shown as its chain of separator tokens ('(' or ','
 * or ')'), each followed by the list elements hanging off that separator.
 * Any other token is shown as its own text followed by the tokens chained
 * from its head (trailing blanks).
 */

void    tok_show(t)
register struct token *t;
{
    register struct token *sub;

    if (t->tokno != TOK_LIST) {
        register char *text = t->vstr->str;

        /*
         * Measurements show that it pays off to treat single-character
         * tokens specially. Either form may change the output line number.
         */

        CHECK_LINE_CONTROL(t->path, t->line);
        if (text[1] != 0) {
            put_str(text);                /* multi-character token */
        } else {
            put_ch(*text);                /* single-character token */
        }
        out_line = t->end_line;           /* may span multiple lines */

        sub = t->head;
        while (sub) {                     /* trailing blanks */
            tok_show(sub);
            sub = sub->next;
        }
        return;
    }

    /*
     * List token. Everything below goes through tok_xxx() primitives, which
     * take care of line control themselves.
     */
    {
        register struct token *sep;

        for (sep = t->head; sep; sep = sep->next) {
            tok_show_ch(sep);             /* '(' or ',' or ')' */
            sub = sep->head;
            while (sub) {                 /* show list element */
                tok_show(sub);
                sub = sub->next;
            }
        }
    }
}

Generated by  Doxygen 1.6.0   Back to index