You could also do what I have done in my assembler, I use Flex and Bison.
This makes life so much easier. A lex file sets the tokens and the bison sets the grammar.
here is my lex and bison files for my 6502/65C02 macro assembler with scripting.
LexCode:
%option case-insensitive yylineno noyywrap
%{
#pragma warning(disable:4996)
#pragma warning(disable:6011)
#pragma warning(disable:6387)
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "opcodes.h"
#include "pasm64.h"
#include "pasm64.tab.h"
#include "symbol.h"
#include "node.h"
#include "str.h"
int inDoLoop = 0;
int inMacroDef = 0;
#define OP(op) { yylval.iValue = _ ## op; return OPCODE; }
#define OP_REL(op) { yylval.iValue = _ ## op ## 0 + yytext[3] - '0'; return OPCODE; }
#define OP_ILLEGAL(op) { if (AllowIllegalOpCpodes) OP(op) else REJECT; }
#define OP_65C02(op) { if (CPUMode == cpu_65C02) OP(op) else REJECT; }
#define OP_65C02_REL(op) { if (CPUMode == cpu_65C02) OP_REL(op) else REJECT; }
#define INT(off, base) { yylval.iValue = (int) strtol (yytext + off, NULL, base); \
return INTEGER; \
}
#define INT_1BYTE { int outlen; \
char* tmpStr = SantizeString(yytext, &outlen); \
yylval.iValue = (int)tmpStr[1]; \
free(tmpStr); \
return INTEGER; \
}
%}
%x C_COMMENT
ES (\\(['"\?\\abfnrtv]|[0-7]{1,3}|x[a-fA-F0-9]+))
LOGOR [|][|]
BITOR [|]
LOGAND [&][&]
BITAND [&]
STRVALUE (\"([^"\\\n]|{ES})*\")+
%%
"ORA" OP(ora)
"AND" OP(and)
"EOR" OP(eor)
"ADC" OP(adc)
"SBC" OP(sbc)
"CMP" OP(cmp)
"CPX" OP(cpx)
"CPY" OP(cpy)
"DEC" OP(dec)
"DEC"[ /t]*"A" OP(dec)
"DEX" OP(dex)
"DEY" OP(dey)
"INC" OP(inc)
"INC"[ /t]*"A" OP(inc)
"INX" OP(inx)
"INY" OP(iny)
"ASL" OP(asl)
"ASL"[ /t]*"A" OP(asl)
"ROL" OP(rol)
"ROL"[ /t]*"A" OP(rol)
"LSR" OP(lsr)
"LSR"[ /t]*"A" OP(lsr)
"ROR" OP(ror)
"ROR"[ /t]*"A" OP(ror)
"LDA" OP(lda)
"STA" OP(sta)
"LDX" OP(ldx)
"STX" OP(stx)
"LDY" OP(ldy)
"STY" OP(sty)
"TAX" OP(tax)
"TXA" OP(txa)
"TAY" OP(tay)
"TYA" OP(tya)
"TSX" OP(tsx)
"TXS" OP(txs)
"PLA" OP(pla)
"PHA" OP(pha)
"PLP" OP(plp)
"PHP" OP(php)
"BPL" OP(bpl)
"BMI" OP(bmi)
"BVC" OP(bvc)
"BVS" OP(bvs)
"BCC" OP(bcc)
"BCS" OP(bcs)
"BNE" OP(bne)
"BEQ" OP(beq)
"BRK" OP(brk)
"RTI" OP(rti)
"JSR" OP(jsr)
"RTS" OP(rts)
"JMP" OP(jmp)
"BIT" OP(bit)
"CLC" OP(clc)
"SEC" OP(sec)
"CLD" OP(cld)
"SED" OP(sed)
"CLI" OP(cli)
"SEI" OP(sei)
"CLV" OP(clv)
"NOP" OP(nop)
"SLO" OP_ILLEGAL(slo)
"RLA" OP_ILLEGAL(rla)
"SRE" OP_ILLEGAL(sre)
"RRA" OP_ILLEGAL(rra)
"SAX" OP_ILLEGAL(sax)
"LAX" OP_ILLEGAL(lax)
"DCP" OP_ILLEGAL(dcp)
"ISC" OP_ILLEGAL(isc)
"ANC" OP_ILLEGAL(anc)
"ANC2" OP_ILLEGAL(anc2)
"ALR" OP_ILLEGAL(alr)
"ARR" OP_ILLEGAL(arr)
"XAA" OP_ILLEGAL(xaa)
"LAX2" OP_ILLEGAL(lax2)
"AXS" OP_ILLEGAL(axs)
"SBC2" OP_ILLEGAL(sbc2)
"AHX" OP_ILLEGAL(ahx)
"SHY" OP_ILLEGAL(shy)
"SHX" OP_ILLEGAL(shx)
"TAS" OP_ILLEGAL(tas)
"LAS" OP_ILLEGAL(las)
"BRA" OP_65C02(bra)
"PHX" OP_65C02(phx)
"PHY" OP_65C02(phy)
"PLX" OP_65C02(plx)
"PLY" OP_65C02(ply)
"STZ" OP_65C02(stz)
"TRB" OP_65C02(trb)
"TSB" OP_65C02(tsb)
"STP" OP_65C02(stp)
"WAI" OP_65C02(wai)
"BBR"[0-7] OP_65C02_REL(bbr)
"BBS"[0-7] OP_65C02_REL(bbs)
"RMB"[0-7] OP_65C02_REL(rmb)
"SMB"[0-7] OP_65C02_REL(smb)
".BYTE" return BYTE;
".DB" return BYTE;
".DCB" return BYTE;
".WORD" return WORD;
".DW" return WORD;
".DCW" return WORD;
".DS" return DS;
".EQU" return EQU;
"NOT" return NOT;
">=" return GE;
"<=" return LE;
"==" return EQ;
"!=" return NE;
"<>" return NE;
"<<" return SHIFT_LEFT;
">>" return SHIFT_RIGHT;
{LOGOR} return OR;
{BITOR} return BIT_OR;
{LOGAND} return AND;
{BITAND} return BIT_AND;
".REPEAT" return REPEAT;
".UNTIL" return UNTIL;
".END" return END;
".ENDIF" return ENDIF;
".IF" return IF;
".ELSE" return ELSE;
".PRINT" return PRINT;
"\?" return PRINT;
".PRINTALL" return PRINTALL;
"\?\?" return PRINTALL;
".FOR" return FOR;
".NEXT" return NEXT;
".STEP" return STEP;
".TO" return TO;
".DOWNTO" return DOWNTO;
".STR" return STR;
".STRING" return STR;
".ORG" return ORG;
".SECTION" return SECTION;
".ENDSECTION" return ENDSECTION;
".SECT" return SECTION;
".ENDS" return ENDSECTION;
".WHILE" { if (inDoLoop > 0) { inDoLoop--; return ENDDO; } else { inDoLoop = 0; return WHILE; }}
".WEND" { return WEND; }
".DO" { inDoLoop++; return DO; }
".MACRO" { inMacroDef++; return MACRO; }
".MAC" { inMacroDef++; return MACRO; }
".ENDMACRO" { inMacroDef--; return ENDMACRO; }
".ENDM" { inMacroDef--; return ENDMACRO; }
".REGX" { return REGX; }
".REGY" { return REGY; }
".VAR" { return VAR; }
".6502"[ /t]*"ON" { CPUMode = cpu_6502; }
".65C02"[ /t]*"ON" { CPUMode = cpu_65C02; }
".ILLEGAL"[ /t]*"ON" { AllowIllegalOpCpodes = TRUE; }
".ILLEGAL"[ /t]*"OFF" { AllowIllegalOpCpodes = FALSE; }
".WARN"[ /t]*"ON" { NoWarnings = FALSE; }
".WARN"[ /t]*"OFF" { NoWarnings = TRUE; }
".C64" { OutFileFormat = c64; }
[@][0-9]+ {
if (!inMacroDef)
REJECT;
yylval.strValue = Strdup(yytext); return SYMBOL;
}
[\'].[\'] INT_1BYTE
[\'][\\].[\'] INT_1BYTE
$[0-9A-Fa-f]* INT(1, 16)
[\'][\\]x[0-9A-Fa-f]+[\'] INT(3, 16)
[0-9]* INT(0, 10)
%[0-1]* INT(1, 2)
[\'][\\][0-7]{3}[\'] INT(2, 8)
{STRVALUE} { yylval.strValue = Strdup(yytext); return STRING_LITERAL; }
"*"[ /t]*[=] { unput('='); return PCASSIGN; }
"*"[ /t]*".EQU" { unput('='); return PCASSIGN; }
"X" { return 'X'; }
"Y" { return 'Y'; }
[A-Za-z_][A-Za-z0-9_.]*:? { yylval.strValue = Strdup(yytext); return SYMBOL; }
"/*" { BEGIN(C_COMMENT); }
<C_COMMENT>"*/" { BEGIN(INITIAL); }
<C_COMMENT>.|[\n] { /* ignore comments */ }
;[^\n]* { /* ignore comments */ }
[/][/][^\n]* { /* ignore comments */ }
"~" { return '~'; }
"^" { return '^'; }
[-<>=+*/#,();=\n] { return *yytext; }
[ \t]+ { /* ignore white space */ }
. { yyerror("syntax error"); }
%%
BisonCode:
%no-lines
%{
// ***********************************************************************
// Author : Paul Baxter
// Created : 02-23-2015
//
// copyright (c) 2015 Paul Baxter
//
// Last Modified By : Paul
// Last Modified On : 11-4-2015
// ***********************************************************************
#pragma warning(disable:4065)
#pragma warning(disable:4996)
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include "pasm64.h"
#include "opcodes.h"
#include "node.h"
#include "genlist.h"
#include "pasm64.tab.h"
#include "symbol.h"
#include "str.h"
%}
%union
{
int iValue; /* integer value */
char* strValue; /* string */
char* sIndex; /* symbol table pointer */
parseNode *nPtr; /* node pointer */
};
%token <iValue> INTEGER
%token <iValue> OPCODE
%token <sIndex> SYMBOL MACROPARAM
%token <strValue> STRING_LITERAL
%token WHILE ENDDO REPEAT UNTIL IF PRINT PRINTALL EQU ORG PCASSIGN
%token END DO MACRO ENDMACRO ENDIF WEND STATEMENT EXPRLIST STR
%token FOR NEXT TO DOWNTO STEP NOT
%token BYTE WORD LOBYTE HIBYTE DS
%token REGX REGY VAR
%token SECTION ENDSECTION
%nonassoc ELSE UMINUS '~'
%left SHIFT_LEFT SHIFT_RIGHT
%left OR AND GE LE EQ NE '>' '<'
%left BIT_OR BIT_AND '^'
%left '+' '-'
%left '*' '/'
%type <nPtr> stmt_list stmt
%type <nPtr> opcode regloopexpr
%type <nPtr> macrodef macrocall expr_list symbol_list
%type <nPtr> symbol_assign symbol_value var_def pc_assign
%type <nPtr> expr subexpr ifexpr loopexpr
%type <nPtr> section endsection
%%
program
: program stmt { ex($2); }
| /* NULL */
;
stmt
: opcode '\n' { $$ = $1; }
| symbol_value '\n' { $$ = $1; }
| symbol_value opcode '\n' { ex($1); $$ = $2; }
| symbol_assign '\n' { $$ = $1; }
| pc_assign '\n' { $$ = $1; }
| ifexpr '\n' { $$ = $1; }
| loopexpr '\n' { $$ = $1; }
| regloopexpr '\n' { $$ = $1; }
| macrodef '\n' { $$ = $1; }
| macrocall '\n' { $$ = $1; }
| section '\n' { $$ = $1; }
| endsection '\n' { $$ = $1; }
| var_def '\n' { $$ = $1; }
| '\n' { $$ = opr(STATEMENT, 0); }
;
stmt_list
: stmt { $$ = $1; }
| stmt_list stmt { $$ = opr(STATEMENT, 2, $1, $2); }
;
section
: SECTION SYMBOL { $$ = opr(SECTION, 1, id($2)); }
;
endsection
: ENDSECTION { $$ = opr(ENDSECTION, 0); }
;
ifexpr
: IF subexpr stmt_list ELSE stmt_list ENDIF { $$ = opr(IF, 3, $2, $3, $5); }
| IF subexpr stmt_list ENDIF { $$ = opr(IF, 2, $2, $3); }
;
loopexpr
: REPEAT '\n' stmt_list UNTIL subexpr { $$ = opr(REPEAT, 2, $3, $5); }
| DO stmt_list ENDDO subexpr { $$ = opr(DO, 2, $2, $4); }
| WHILE subexpr '\n' stmt_list WEND { $$ = opr(WHILE, 2, $2, $4); }
| FOR symbol_assign TO subexpr '\n' stmt_list NEXT SYMBOL { $$ = opr(FOR, 4, $2, $4, $6, id($8)); }
| FOR symbol_assign TO subexpr STEP subexpr '\n' stmt_list NEXT SYMBOL { $$ = opr(FOR, 5, $2, $4, $8, id($10), $6); }
;
regloopexpr
: FOR REGX '=' subexpr TO subexpr '\n' stmt_list NEXT 'X' { $$ = opr(REGX, 4, $4, $6, $8, con(1, 0)); }
| FOR REGX '=' subexpr DOWNTO subexpr '\n' stmt_list NEXT 'X' { $$ = opr(REGX, 4, $4, $6, $8, con(-1,0)); }
| FOR REGY '=' subexpr TO subexpr '\n' stmt_list NEXT 'Y' { $$ = opr(REGY, 4, $4, $6, $8, con(1,0)); }
| FOR REGY '=' subexpr DOWNTO subexpr '\n' stmt_list NEXT 'Y' { $$ = opr(REGY, 4, $4, $6, $8, con(-1,0)); }
;
expr_list
: subexpr { $$ = opr(EXPRLIST, 1, $1); }
| STRING_LITERAL { $$ = opr(EXPRLIST, 1, str($1)); }
| expr_list ',' subexpr { $$ = opr(EXPRLIST, 2, $1, $3); }
| expr_list ',' STRING_LITERAL { $$ = opr(EXPRLIST, 2, $1, str($3)); }
;
macrodef
: MACRO SYMBOL stmt_list ENDMACRO { $$ = opr(MACRO, 2, macroid($2), $3); }
;
macrocall
: SYMBOL expr_list { $$ = macroex($1, $2); }
;
symbol_list
: SYMBOL { $$ = opr(EXPRLIST, 1, id($1)); }
| symbol_list ',' SYMBOL { $$ = opr(EXPRLIST, 2, $1, id($3)); }
| SYMBOL '=' subexpr { $$ = opr(EXPRLIST, 3, id($1), $3); }
| symbol_list ',' SYMBOL '=' subexpr{ $$ = opr(EXPRLIST, 3, $1, id($3), $5); }
;
var_def
: VAR symbol_list { $$ = opr(VAR, 1, $2); }
;
symbol_assign
: SYMBOL '=' subexpr { $$ = opr('=', 2, id($1), $3); }
| SYMBOL EQU subexpr { $$ = opr(EQU, 2, id($1), $3); }
;
pc_assign
: PCASSIGN '=' subexpr { $$ = opr(PCASSIGN, 1, $3); }
| PCASSIGN EQU subexpr { $$ = opr(PCASSIGN, 2, $3); }
;
symbol_value
: SYMBOL {
SymbolTablePtr sym = LookUpSymbol($1);
if (sym && sym->ismacroname)
{
$$ = macroex($1, NULL);
}
else
{
$$ = opr('=', 2, id($1), con(PC, TRUE));
}
}
;
opcode
: OPCODE { $$ = opcode($1, i, 0); }
| OPCODE '#' subexpr { $$ = opcode($1, I, 1, $3); }
| OPCODE expr { $$ = opcode($1, a, 1, $2); }
| OPCODE expr ',' 'X' { $$ = opcode($1, ax, 1, $2); }
| OPCODE expr ',' 'Y' { $$ = opcode($1, ay, 1, $2); }
| OPCODE '(' subexpr ')' { $$ = opcode($1, ind, 1, $3); }
| OPCODE '(' subexpr ',' 'X' ')' { $$ = opcode($1, aix, 1, $3); }
| OPCODE '(' subexpr ')' ',' 'Y' { $$ = opcode($1, zpiy, 1, $3); }
| OPCODE expr ',' subexpr { $$ = opcode($1, zr, 2, $2, $4); }
| ORG subexpr { $$ = opr(ORG, 1, $2); }
| DS subexpr { $$ = opr(DS, 1, $2); }
| BYTE expr_list { $$ = data(1, $2); }
| WORD expr_list { $$ = data(2, $2); }
| STR expr_list { $$ = data(0, $2); }
| PRINT { $$ = opr(PRINT, 0); }
| PRINT expr_list { $$ = opr(PRINT, 1, $2); }
| PRINTALL { $$ = opr(PRINTALL, 0); }
| PRINTALL expr_list { $$ = opr(PRINTALL, 1, $2); }
;
subexpr
: expr { $$ = $1; }
| '*' { $$ = con(PC, TRUE); }
| '(' subexpr ')' { $$ = $2; }
;
expr
: INTEGER { $$ = con($1, FALSE); }
| SYMBOL { $$ = id($1); }
| '-' subexpr %prec UMINUS { $$ = opr(UMINUS, 1, $2); }
| '~' subexpr %prec UMINUS { $$ = opr('~', 1, $2); }
| '<' subexpr %prec UMINUS { $$ = opr(LOBYTE, 1, $2); }
| '>' subexpr %prec UMINUS { $$ = opr(HIBYTE, 1, $2); }
| NOT subexpr %prec UMINUS { $$ = opr(NOT, 1, $2); }
| subexpr OR subexpr { $$ = opr(OR, 2, $1, $3); }
| subexpr AND subexpr { $$ = opr(AND, 2, $1, $3); }
| subexpr SHIFT_LEFT subexpr { $$ = opr(SHIFT_LEFT, 2, $1, $3); }
| subexpr SHIFT_RIGHT subexpr { $$ = opr(SHIFT_RIGHT, 2, $1, $3); }
| subexpr '<' subexpr { $$ = opr('<', 2, $1, $3); }
| subexpr '>' subexpr { $$ = opr('>', 2, $1, $3); }
| subexpr GE subexpr { $$ = opr(GE, 2, $1, $3); }
| subexpr LE subexpr { $$ = opr(LE, 2, $1, $3); }
| subexpr NE subexpr { $$ = opr(NE, 2, $1, $3); }
| subexpr EQ subexpr { $$ = opr(EQ, 2, $1, $3); }
| subexpr BIT_AND subexpr { $$ = opr(BIT_AND, 2, $1, $3); }
| subexpr BIT_OR subexpr { $$ = opr(BIT_OR, 2, $1, $3); }
| subexpr '^' subexpr { $$ = opr('^', 2, $1, $3); }
| subexpr '+' subexpr { $$ = opr('+', 2, $1, $3); }
| subexpr '-' subexpr { $$ = opr('-', 2, $1, $3); }
| subexpr '*' subexpr { $$ = opr('*', 2, $1, $3); }
| subexpr '/' subexpr { $$ = opr('/', 2, $1, $3); }
;
%%