/*
 *  K&R Oroogu Compiler
 *
 *  Version 0.301
 *
 *  Implemented by Georg Kraml & Angelika Riemer
 * 
 *  Notes:
 *
 *    1. This is a straightforward implementation of a syntax-driven
 *       compiler front end performing predictive recursive-descent
 *       parsing of a simple L-attributed grammar. Instead of intermediate
 *       code, C statements are emitted. There is no back end.
 *
 *    2. The implementation is paranoid. We check the return value of
 *       *everything* that doesn't return void anyway, with the exception
 *       of fprintf(stderr, ...) (where it simply wouldn't make sense)
 *       and fgetc(...) (because EOF is handled specifically anyway).
 *
 *  Further information is available from <http://www.purists.org/oroogu/>.
 *  Please send bug reports to <georg@purists.org>.
 *
 */

#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 *  UNLINK: name of the routine used to remove the output file after an
 *  error. Expands to _unlink when _WIN32 is defined; otherwise to unlink,
 *  which is declared by hand right here.
 */
#ifdef _WIN32
	#define UNLINK _unlink
#else
	extern int unlink(const char *);
	#define UNLINK unlink
#endif

/*
 *  Non-terminal tokens
 */
#define NUMBER	256
#define QUEUE	257
#define RANGE	258
#define EXP	259
#define STRING  260
#define BOGUS	261

/*
 *  Buffer sizes
 */
#define IDLEN	8	/* maximum identifier length; the lexer rejects
			   longer identifiers */
#define PATHLEN 260	/* size of the path buffers (source.path, outpath) */
#define STRLEN  4096	/* maximum string literal length; also sizes the
			   lexer's scratch buffer */
 
/*
 *  Miscellaneous constants
 *
 *    CANTHAPPEN  Message handed to matchreq where the caller has already
 *                inspected the lookahead symbol.
 *
 *    BANNER      Compiler identification written into generated files.
 *                Built by adjacent string literal concatenation; the
 *                "##" operator must not be used here because pasting
 *                string literals does not yield a valid token and would
 *                keep __DATE__ from being expanded.
 *
 *    HEADER      Name of the header the generated code includes.
 */
#define CANTHAPPEN "can't happen"
#define BANNER	"K&R Oroogu compiler version 0.30 (" __DATE__ ")"
#define HEADER "orsp.h"

/*
 *  Forward declarations
 *
 *  The parsing functions are mutually recursive (factor calls expr,
 *  expr calls term, term calls power, power calls factor; body calls
 *  start, start calls action, action calls body), so they are declared
 *  here ahead of their definitions.
 */
int expr(void);
void start(void);
void body(void);
void closefile(void);
int power(void);
int factor(void);
void lval(void);
void rval(void);
int rval2(void);
void action(void);
void atom(void);
int term(void);

/*
 *  Token description structure
 *
 *  Filled in by lexer(). The global "t" holds the current lookahead
 *  symbol.
 *
 *    token       Token code: NUMBER, QUEUE, RANGE, EXP, STRING, a single
 *                character, or EOF.
 *
 *    val         Value of a NUMBER token, or symbol table index of a
 *                QUEUE token.
 *
 *    strlit      Text of a STRING token. Points into a static buffer
 *                inside lexer().
 */
struct token
{
        int token, val;
	char *strlit;
};

/*
 *  Lookup table entry structure
 *
 *  The symbol table is a singly linked list. lookup() appends new
 *  entries at the tail and reports an identifier's 1-based position in
 *  the list as its index.
 *
 *    lexeme      Identifier text, at most IDLEN characters plus the
 *                terminator.
 *
 *    next        Next entry, or NULL at the end of the list.
 */
struct id
{
	char lexeme[IDLEN + 1];
	struct id *next;
};

/*
 *  Source file description structure
 *
 *  Records form a stack headed by the global "files"; openfile() pushes
 *  and closefile() pops.
 *
 *    path        Path the file was opened with.
 *
 *    f           Stream the lexer reads from.
 *
 *    line        Current line number; starts at 1 and is advanced by the
 *                lexer.
 *
 *    next        Record that was on top of the stack when this file was
 *                opened.
 */
struct source
{
	char path[PATHLEN];
	FILE *f;
	int line;
	struct source *next;
};

/*
 *  Structure keeping track of loop nesting depth
 *
 *  body() pushes one record per loop it enters and pops it on the way
 *  out; error() prints the recorded lines as nesting information.
 *
 *    line        Source line number recorded when the loop was entered.
 *
 *    next        Record of the enclosing loop, or NULL.
 */
struct loopline
{
	int line;
	struct loopline *next;
};

/*
 *  Global variables
 */
char *cmdname, outpath[PATHLEN];	/* argv[0]; path of the output file */
struct id *table = NULL;		/* head of the symbol table list */
struct source *files;			/* stack of open source files */
struct token t;				/* lookahead symbol */
struct loopline *loop = NULL;		/* stack of loops being parsed */
int level, maxlevel, maxqueues;		/* current and maximum loop nesting
					   depth; maximum number of queues
					   on a left-hand side */
FILE *output;				/* stream generated code is written
					   to; NULL while no output file
					   is open */

/*
 *  error: report error and abort compilation.
 *
 *  Parameters:
 *
 *    fmt         Format string, xxprintf-style.
 *
 *    ...         Data referred to by the format string.
 */
void
error(const char *fmt, ...)
{
	va_list ap;
	char *n = cmdname;
	struct loopline *tmp;

	/* print error message */
	while (*n) n++;
	while (n != cmdname && *(n - 1) != '/' && *(n - 1) != '\\') {
		n--;
	}
	fprintf(stderr, "%s: ", n);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fprintf(stderr, ".\n");

	/* print nesting information, if available */
	if (loop) {
		fprintf(stderr, "%s: loops entered in line(s)", n);
		while (loop) {
			fprintf(stderr, " %d", loop->line);
			tmp = loop;
			loop = loop->next;
			free(tmp);
		}
		fprintf(stderr, ".\n");
	}

	/* unlink output file */
	if (output) {
		if (fclose(output)) {
			output = NULL;
			error("cannot close output file: %s\n", outpath);
		}
		output = NULL;
		if (UNLINK(outpath)) {
			error("cannot unlink output file: %s\n", outpath);
		}
	}

	/* abort */
	exit(1);	
}

/*
 *  openfile: opens a source file for processing.
 *
 *  Parameters:
 *
 *    path         Source file path
 *
 *  Return values:
 *
 *    1            The source file has been opened, a record pertaining
 *                 to the file has been added to the files stack.
 *
 *    0            Error opening file.
 */
int
openfile(const char *path)
{
	struct source *tmp = files;

	if ((files = malloc(sizeof(struct source))) == NULL) {
		error("internal error: out of memory");
	}
	if ((files->f = fopen(path, "r")) == NULL) {
		return 0;
	}
	strcpy(files->path, path);
	files->line = 1;
	files->next = tmp;
	return 1;
}

/*
 *  closefile: closes source file.
 */
void
closefile()
{
	struct source *tmp = files;

	files = files->next;
	if (fclose(tmp->f)) {
		error("cannot close input file: %s", tmp->path);
	}
	free(tmp);
}

/*
 *  lookup: finds an identifier in the symbol table, appending it first
 *  if it is not there yet.
 *
 *  Parameters:
 *
 *    id          Identifier to be looked for in the symbol table.
 *
 *  Returns:
 *
 *    The identifier's index, i. e. its position in the table counted
 *    from 1. Index 0 is never handed out; it is reserved for the
 *    "temporary" queue used internally by the runtime support package.
 */
int
lookup(const char *id)
{
	struct id *entry, *last = NULL;
	int pos = 1;

	/* walk the list, remembering the tail in case we have to append */
	for (entry = table; entry != NULL; entry = entry->next) {
		if (strcmp(entry->lexeme, id) == 0) {
			return pos;
		}
		last = entry;
		pos++;
	}

	/* not found: create a new entry at the end of the list */
	entry = malloc(sizeof(struct id));
	if (entry == NULL) {
		error("internal error: out of memory");
	}
	strcpy(entry->lexeme, id);
	entry->next = NULL;
	if (last != NULL) {
		last->next = entry;
	}
	else {
		table = entry;
	}
	return pos;
}

/*
 *  pushback: wrapper for ungetc operating on the current source file;
 *  aborts via error() if the character cannot be pushed back.
 *
 *  Parameters:
 *
 *    c           Character to be pushed back into the input stream
 */
void
pushback(int c)
{
	if (ungetc(c, files->f) == EOF) {
		error("cannot unread character: %s", files->path);
	}
}

/*
 *  lexer: identify and return the next token in the input stream.
 *
 *  Parameters:
 *
 *    t           Pointer to token description structure.
 *
 *  Return values:
 *
 *    Function just returns its argument pointer.
 */
struct token *
lexer(struct token *t)
{
        int c, comment = 0, ix;
	static char buf[STRLEN + 1];

	while (1) {
		if (!files) {
			t->token = EOF;
			return t;
		}
	        c = fgetc(files->f);

		/* EOF? */
		if (c == EOF) {
			closefile();
			comment = 0;
			continue;
		}

		/* line break */
		if (c == '\n') {
			files->line ++;
			comment = 0;
			continue;
		}

		/* skip rest of function if in comment */
		if (comment) {
			continue;
		}

		/* comment */
		if (c == '#') {
			comment = 1;
		}

		/* include directive */
		else if (c == '$') {
			memset(buf, '\0', PATHLEN + 1);
			ix = 0;
			while (isspace(c = getc(files->f))) {
				/* strip out whitespace */	
			}
			while (!isspace(c)) {
				if (ix == PATHLEN) {
					error("%s: line %d: "
						"include file path too long",
						files->path, files->line);
				}
				buf[ix++] = (char)c;
				c = fgetc(files->f);
			}
			if (c != EOF) {
				pushback(c);
			}
			if (!ix) {
				error("%s: line %d: "
					"include file path expected: %s",
					files->path, files->line);
			}
			if (!openfile(buf)) {
				error("%s: line %d: "
					"cannot open include file: %s",
					files->path, files->line);
			}
		}

		/* number */
		else if (isdigit(c)) {
			pushback(c);
			if (fscanf(files->f, "%d", &t->val) == 0) {
				error("cannot read from file: %s",
					files->path);
			}
			t->token = NUMBER;
			return t;
		}

		/* queue */
		else if (isalpha(c) || c == '_') {
			memset(buf, '\0', IDLEN + 1);
			ix = 0;
			while ((isalnum(c) || c == '_')) {
				if (ix == IDLEN) {
					error("%s: line %d: "
						"identifier too long",
						files->path, files->line);
				}
				buf[ix++] = (char)c;
				c = fgetc(files->f);
			}
			if (c != EOF) {
				pushback(c);
			}
			t->val = lookup(buf);
			t->token = QUEUE;
			return t;
		}

		/* string literal */
		else if (c == '\"') {
			memset(buf, '\0', STRLEN + 1);
			ix = 0;
			c = fgetc(files->f);
			while (c != '\"') {
				if (ix == STRLEN) {
					error("%s: line %d: "
						"string too long",
						files->path, files->line);
				}
				buf[ix++] = (char)c;
				c = fgetc(files->f);
			}
			t->strlit = buf;
			t->token = STRING;
			return t;
		}

		/* range operator */
		else if (c == '.') {
			c = fgetc(files->f);
			if (c == '.') {
				t->token = RANGE;
				return t;
			}
			pushback(c);
			t->token = '.';
			return t;
		}

		/* exponentiation operator */
		else if (c == '*') {
			c = fgetc(files->f);
			if (c == '*') {
				t->token = EXP;
				return t;
			}
			pushback(c);
			t->token = '*';
			return t;
		}

		/* anything else but whitespace */
		else if (!isspace(c)) {
			t->token = c;
			return t;
		}
	}
}

/*
 *  emit: write generated code to the output file.
 *
 *  A format string starting with a line break is treated specially:
 *  the line break is written followed by one tab plus one tab per
 *  current nesting level, then the remainder of the format string is
 *  processed as usual. Write failures abort via error().
 *
 *  Parameters:
 *
 *    fmt         Format string, xxprintf-style.
 *
 *    ...         Data referred to by the format string.
 */
void
emit(const char *fmt, ...)
{
	va_list args;
	int depth;

	va_start(args, fmt);
	if (*fmt == '\n') {
		/* start a fresh, properly indented line */
		if (fprintf(output, "\n\t") < 0) {
			error("cannot write to file: %s", outpath);
		}
		for (depth = level; depth > 0; depth--) {
			if (fprintf(output, "\t") < 0) {
				error("cannot write to file: %s", outpath);
			}
		}
		/* the leading line break has been dealt with */
		fmt = fmt + 1;
	}
	if (vfprintf(output, fmt, args) < 0) {
		error("cannot write to file: %s", outpath);
	}
	va_end(args);
}

/*
 *  matchreq: check and advance lookahead symbol, abort on mismatch
 *
 *  Parameters:
 *
 *    token       Token the lookahead symbol is supposed to match
 *
 *    msg         Error message to be printed in case of mismatch
 */
void
matchreq(int token, const char *msg)
{
	if (token == t.token) {
		lexer(&t);
		return;
	}
	if (files) {
		error("%s: line %d: %s", files->path, files->line, msg);
	}
	else {
		error("unexpected EOF: %s", msg);
	}
}

/*
 *  matchtest: advance the lookahead symbol if it is a given token.
 *
 *  Parameters:
 *
 *    token       Token the lookahead symbol is compared against.
 *
 *  Returns:
 *
 *    token       The lookahead symbol matched and has been advanced.
 *                The caller's own argument is handed back, which lets
 *                callers chain several tests and see which one hit.
 *
 *    zero        No match; the lookahead symbol is untouched.
 */
int
matchtest(int token)
{
	if (token != t.token) {
		return 0;
	}
	lexer(&t);
	return token;
}

/*
 *  factor: parsing helper function
 *
 *  Semantic actions:
 *
 *    factor  >  integer "VAL"
 *            |  -integer "-VAL"
 *            |  queue "get(VAL)"
 *            |  < queue "peek(VAL)"
 *            |  ( "(" expr ) ")"
 *
 *    VAL is a synthesized attribute of "integer" and "queue", it holds
 *    either the integer's value or the queue's ID.
 *
 *  A minus sign not followed by an integer, or a "<" not followed by a
 *  queue, is a syntax error in the source program and is reported as
 *  such.
 *
 *  Return values:
 *
 *    nonzero     The nonterminal derives an arithmetic expression. If
 *                any of the "get" and "peek" calls made during evaluating
 *                this expression returns a string instead of a number,
 *                an exception must be thrown.
 *
 *    zero        The nonterminal derives a reference to a queue. The
 *                corresponding call to "get" or "peek" may return
 *                a string instead of a number.
 */
int
factor()
{
	int check, val;

	switch (t.token) {
	case '-':
		matchreq('-', CANTHAPPEN);
		/* read VAL only after the token is known to carry one */
		if (t.token != NUMBER) {
			matchreq(NUMBER, "number expected after minus sign");
		}
		val = t.val;
		matchreq(NUMBER, CANTHAPPEN);
		emit("-");
		emit("%d", val);
		return 1;

	case NUMBER:
		emit("%d", t.val);
		matchreq(NUMBER, CANTHAPPEN);
		return 1;

	case QUEUE:
		emit("get(%d)", t.val);
		matchreq(QUEUE, CANTHAPPEN);
		return 0;

	case '<':
		matchreq('<', CANTHAPPEN);
		if (t.token != QUEUE) {
			matchreq(QUEUE, "variable expected after '<'");
		}
		emit("peek(%d)", t.val);
		matchreq(QUEUE, CANTHAPPEN);
		return 0;

	case '(':
		emit("(");
		matchreq('(', CANTHAPPEN);
		check = expr();
		emit(")");
		matchreq(')', "right parenthesis expected after expression");
		return check;

	default:
		break;
	}

	/* BOGUS never matches, so this always reports the error */
	matchreq(BOGUS, "operand expected");
	return 0;
}

/*
 *  power: parsing helper function.
 *
 *  Semantic actions:
 *
 *    power   ->  "exp(" factor ", " power2 ")"
 *    power2  ->  ** power
 *             |  e "1"
 *
 *  Every factor is wrapped in an "exp(" call; without an exponent the
 *  second argument is the constant 1. The recursion on the right-hand
 *  side makes the operator right-associative.
 *
 *  Return values:
 *
 *    See "factor". Nonzero is returned whenever an exponent is present.
 */
int
power()
{
	int arith;

	emit("exp(");
	arith = factor();
	emit(", ");
	if (!matchtest(EXP)) {
		emit("1");
	}
	else {
		power();
		arith = 1;
	}
	emit(")");
	return arith;
}

/*
 *  term: parsing helper function.
 *
 *  Semantic actions:
 *
 *    term    ->  power term2
 *    term2   ->  * " * " power term2
 *             |  / " / assert(" power ")" term2
 *             |  % " % assert(" power ")" term2
 *             |  e
 *
 *  Return values:
 *
 *    See "factor". Nonzero is returned whenever at least one operator
 *    is present.
 */
int
term()
{
	int op, arith;

	arith = power();
	for (;;) {
		/* find out which multiplicative operator follows, if any */
		op = matchtest('*');
		if (op == 0) {
			op = matchtest('/');
		}
		if (op == 0) {
			op = matchtest('%');
		}
		if (op == 0) {
			break;
		}

		arith = 1;
		if (op != '*') {
			/* right operand of / and % is wrapped in a call */
			emit(" %c assert(", op);
			power();
			emit(")");
		}
		else {
			emit(" * ");
			power();
		}
	}
	return arith;
}

/*
 *  expr: parsing helper function.
 *
 *  Semantic actions:
 *
 *    expr    ->  term expr2
 *    expr2   ->  + " + " term expr2
 *             |  - " - " term expr2
 *             |  e
 *
 *  Return values:
 *
 *    See "factor". Nonzero is returned whenever at least one operator
 *    is present.
 */
int
expr()
{
	int op, arith;

	arith = term();
	for (;;) {
		/* find out which additive operator follows, if any */
		op = matchtest('+');
		if (op == 0) {
			op = matchtest('-');
		}
		if (op == 0) {
			break;
		}

		arith = 1;
		emit(" %c ", op);
		term();
	}
	return arith;
}

/*
 *  atom: parsing helper function
 *
 *  Semantic actions:
 *
 *    atom    ->  string "\naddtext(\"STRLIT\");"
 *             |  "\nrange(" expr ", " atom2
 *    atom2   ->  .. expr ", 2);"
 *             |  e "0, CHECK);"
 *
 *    STRLIT is the text of the string literal. CHECK is the value
 *    returned by "expr" for the single expression (see "factor").
 */
void
atom()
{
	int arith;

	if (t.token != STRING) {
		emit("\nrange(");
		arith = expr();
		emit(", ");
		if (matchtest(RANGE)) {
			/* two-sided range */
			expr();
			emit(", 2);");
		}
		else {
			/* single value */
			emit("0, %d);", arith);
		}
		return;
	}

	/* emit before advancing: the next token may reuse the buffer */
	emit("\naddtext(\"%s\");", t.strlit);
	matchreq(STRING, CANTHAPPEN);
}

/*
 *  rval2: parsing helper function
 *
 *  Semantic actions:
 *
 *    rval2   ->  queue
 *             |  ( atom rval3 )
 *    rval3   ->  , atom rval3
 *             |  e
 *
 *  Additional tasks:
 *
 *    Synthesize the QUEUE attribute and return it to the caller.
 *
 *  Return values:
 *
 *    nonzero     Production "rval2 -> queue" was used. The function
 *                returns the symbol table index of the queue in
 *                question; use this value when emitting the
 *                storage instruction.
 *
 *    zero        Production "rval2 -> ( atom rval3 )" was used. Use
 *                the temporary queue's index (zero) when emitting the
 *                storage instruction.
 */
int
rval2()
{
	int index;

	if (matchtest('(')) {
		/* parenthesized list of one or more atoms */
		do {
			atom();
		} while (matchtest(','));
		matchreq(')', "right parenthesis expected after expression");
		return 0;
	}

	/* plain queue: grab its index before the lookahead moves on */
	index = t.val;
	matchreq(QUEUE, "variable or expression expected");
	return index;
}

/*
 *  rval: parsing helper function
 *
 *  Semantic actions:
 *
 *    rval    ->  = rval2 "\nassign(LEVEL, QUEUE);"
 *             |  / rval2 "\nappend(LEVEL, QUEUE);"
 *             |  \ rval2 "\ndiff(LEVEL, QUEUE);"
 *
 *    QUEUE is an attribute synthesized by the "rval2" node. If a single
 *    queue is used on the storage operator's right-hand side, as in
 *
 *      foo = bar
 *
 *    then QUEUE will be the ID of "bar". If an expression is used on
 *    the operator's right-hand side, as in
 *
 *      foo = (bar, baz)
 *
 *    then "rval2" emits statements adding the values the expression
 *    yields to a "temporary" queue (ID 0), and QUEUE will be 0.
 *
 *    LEVEL is an inherited attribute telling us how deeply this block
 *    is nested; i. e., how many surrounding loops there are and in which
 *    slot of the "loop" array the IDs of the loop header queues can
 *    be found.
 *
 *    The reason we explicitly store the IDs somewhere (instead
 *    of just handing them to the assign/append/diff function as
 *    arguments) is that we may need to be able to backreference them
 *    later if the statement is a loop header.
 */
void
rval()
{
	int op, queue;

	/* determine the storage operator; anything else is an error */
	if ((op = matchtest('=')) == 0
		&& (op = matchtest('/')) == 0
		&& (op = matchtest('\\')) == 0) {
		matchreq(BOGUS, "storage operator expected");
		return;
	}

	queue = rval2();
	switch (op) {
	case '=':
		emit("\nassign(%d, %d);", level, queue);
		break;
	case '/':
		emit("\nappend(%d, %d);", level, queue);
		break;
	default:
		emit("\ndiff(%d, %d);", level, queue);
		break;
	}
}

/*
 *  body: parsing helper function
 *
 *  Semantic actions:
 *
 *    body    ->  ( "while(loop(LEVEL)) {" start "}" )
 *
 *    For an explanation of LEVEL refer to the "rval" function.
 *
 *  Additional tasks:
 *
 *    Make sure the "maxlevel" variable contains the maximum nesting
 *    depth of any statement in the program. We need this information 
 *    when allocating the "loop" stack at runtime.
 * 
 *    We could think of "maxlevel" as a synthesized attribute of the
 *    grammar's "start" symbol.
 */
void
body()
{
	struct loopline *tmp = loop;
	
	if ((loop = malloc(sizeof(struct loopline))) == NULL) {
		error("internal compiler error: out of memory");
	}
	loop->line = files->line;
	loop->next = tmp;
	
	emit ("\nwhile (loop(%d)) {", level);
	if (++ level > maxlevel) maxlevel ++;
	start();
	level --;
	emit ("\n}");

	free(loop);
	loop = tmp;
}
 
/*
 *  action: parsing helper function
 *
 *  Semantic actions:
 *
 *    action  -> body
 *             | rval action2
 *    action2 -> body
 *             | e
 *
 *  A left parenthesis introduces a loop body, either directly or
 *  following a storage operation.
 */
void 
action()
{
	if (!matchtest('(')) {
		rval();
		if (!matchtest('(')) {
			/* plain storage statement, no loop attached */
			return;
		}
	}
	body();
}

/*
 *  lval: parsing helper function
 *
 *  Semantic actions:
 *
 *    lval    ->  "\nlval(LEVEL, " queue lval2
 *    lval2   ->  , ", "  queue lval2 
 *	       |  e ", -1);"
 *
 *    For an explanation of LEVEL refer to the "rval" function.
 * 
 *  Additional tasks:
 *
 *    Make sure the global variable maxqueues contains the maximum number
 *    of queues used on any storage operator's left-hand side. We need
 *    this to determine the proper width of the "loop" array.
 *
 *    The "loop" array serves as a stack holding, for each loop currently
 *    alive, the IDs of its loop header queues.
 */
void 
lval()
{
	int queues = 1;

	emit("\nlval(%d, %d, ", level, t.val);
	matchreq(QUEUE, "variable expected at begin of statement");
	while (matchtest(',')) {
		emit("%d, ", t.val);
		matchreq(QUEUE, "variable expected after separator");
		queues ++;
	}
	emit("-1);");
	if (queues > maxqueues) maxqueues = queues;
}

/*
 *  start: parse Oroogu program and emit C code
 *
 *  Semantic actions for this node:
 *
 *    start -> lval action start | e
 *
 *  What ends the statement list depends on the nesting level: at the
 *  outermost level it is EOF, inside a loop body it is the closing
 *  parenthesis. The terminating token is consumed.
 */
void
start()
{
	int closing;

	for (;;) {
		closing = (level == 0) ? EOF : ')';
		if (matchtest(closing)) {
			break;
		}
		lval();
		action();
	}
}

/*
 *  Main function
 */
int
main(int argc, char **argv)
{
	int i = 0, qnum;
	struct id *tmp;

	cmdname = argv[0];
	while (++i < argc) {

		/* open files */
		if (!openfile(argv[i])) {
			error("cannot open input file: %s zz", argv[i]);
		}
		sprintf(outpath, "%s.c", argv[i]);
		if ((output = fopen(outpath, "w")) == NULL) {
			closefile();
			error("cannot create output file: %s", outpath);
		}

		/* emit compiler banner and #include directive */
		emit("/*\n *  Created by the %s\n */\n\n"
			"#include \"%s\"\n\n",
			BANNER, HEADER);

		/* emit "oroogu" function */
		emit("void\noroogu()\n{");
		lexer(&t);
		level = maxlevel = maxqueues = 0;
		start();
		emit(" \n}\n\n");

		/* emit "lexemes" function, reset symbol table */
		emit("void\nlexemes()\n{\n");
		qnum = 1;
		while (table) {
			tmp = table;
			emit("\tlexeme(%d, \"%s\");\n",
				qnum ++, table->lexeme); 
			table = table->next;
			free(tmp);
		}
		emit("}\n\n");

		/* emit "main" function */
		emit("int\nmain(int argc, char **argv)\n{\n"
			"\tstartup(%d, %d, %d, argv[0]);\n"
			"\tlexemes();\n"
			"\toroogu();\n"
			"\tcleanup();\n\treturn 0;\n}\n",
			maxlevel + 1, maxqueues + 1, qnum);

		/* close output file */
		if (fclose(output)) {
			error("cannot flush output file: %s", outpath);
		}
		output = NULL;
	}
	return 0;
}

