Files
Toy/source/toy_lexer.c
2023-01-25 13:54:28 +00:00

350 lines
8.5 KiB
C

#include "toy_lexer.h"
#include "toy_console_colors.h"
#include "toy_keyword_types.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
//static generic utility functions
static void cleanLexer(Toy_Lexer* lexer) {
lexer->source = NULL;
lexer->start = 0;
lexer->current = 0;
lexer->line = 1;
}
static bool isAtEnd(Toy_Lexer* lexer) {
return lexer->source[lexer->current] == '\0';
}
static char peek(Toy_Lexer* lexer) {
return lexer->source[lexer->current];
}
static char peekNext(Toy_Lexer* lexer) {
if (isAtEnd(lexer)) return '\0';
return lexer->source[lexer->current + 1];
}
static char advance(Toy_Lexer* lexer) {
if (isAtEnd(lexer)) {
return '\0';
}
//new line
if (lexer->source[lexer->current] == '\n') {
lexer->line++;
}
lexer->current++;
return lexer->source[lexer->current - 1];
}
static void eatWhitespace(Toy_Lexer* lexer) {
const char c = peek(lexer);
switch(c) {
case ' ':
case '\r':
case '\n':
case '\t':
advance(lexer);
break;
//comments
case '/':
//eat the line
if (peekNext(lexer) == '/') {
while (advance(lexer) != '\n' && !isAtEnd(lexer));
break;
}
//eat the block
if (peekNext(lexer) == '*') {
advance(lexer);
advance(lexer);
while(!(peek(lexer) == '*' && peekNext(lexer) == '/')) advance(lexer);
advance(lexer);
advance(lexer);
break;
}
return;
default:
return;
}
//tail recursion
eatWhitespace(lexer);
}
static bool isDigit(Toy_Lexer* lexer) {
return peek(lexer) >= '0' && peek(lexer) <= '9';
}
static bool isAlpha(Toy_Lexer* lexer) {
return
(peek(lexer) >= 'A' && peek(lexer) <= 'Z') ||
(peek(lexer) >= 'a' && peek(lexer) <= 'z') ||
peek(lexer) == '_'
;
}
static bool match(Toy_Lexer* lexer, char c) {
if (peek(lexer) == c) {
advance(lexer);
return true;
}
return false;
}
//token generators
static Toy_Token makeErrorToken(Toy_Lexer* lexer, char* msg) {
Toy_Token token;
token.type = TOY_TOKEN_ERROR;
token.lexeme = msg;
token.length = strlen(msg);
token.line = lexer->line;
#ifndef TOY_EXPORT
if (Toy_commandLine.verbose) {
printf("err:");
Toy_printToken(&token);
}
#endif
return token;
}
static Toy_Token makeToken(Toy_Lexer* lexer, Toy_TokenType type) {
Toy_Token token;
token.type = type;
token.length = lexer->current - lexer->start;
token.lexeme = &lexer->source[lexer->current - token.length];
token.line = lexer->line;
#ifndef TOY_EXPORT
//BUG #10: this shows TOKEN_EOF twice due to the overarching structure of the program - can't be fixed
if (Toy_commandLine.verbose) {
printf("tok:");
Toy_printToken(&token);
}
#endif
return token;
}
static Toy_Token makeIntegerOrFloat(Toy_Lexer* lexer) {
Toy_TokenType type = TOY_TOKEN_LITERAL_INTEGER; //what am I making?
while(isDigit(lexer)) advance(lexer);
if (peek(lexer) == '.' && (peekNext(lexer) >= '0' && peekNext(lexer) <= '9')) { //BUGFIX: peekNext == digit
type = TOY_TOKEN_LITERAL_FLOAT;
advance(lexer);
while(isDigit(lexer)) advance(lexer);
}
Toy_Token token;
token.type = type;
token.lexeme = &lexer->source[lexer->start];
token.length = lexer->current - lexer->start;
token.line = lexer->line;
#ifndef TOY_EXPORT
if (Toy_commandLine.verbose) {
if (type == TOY_TOKEN_LITERAL_INTEGER) {
printf("int:");
} else {
printf("flt:");
}
Toy_printToken(&token);
}
#endif
return token;
}
static Toy_Token makeString(Toy_Lexer* lexer, char terminator) {
while (!isAtEnd(lexer) && peek(lexer) != terminator) {
advance(lexer);
}
advance(lexer); //eat terminator
if (isAtEnd(lexer)) {
return makeErrorToken(lexer, "Unterminated string");
}
Toy_Token token;
token.type = TOY_TOKEN_LITERAL_STRING;
token.lexeme = &lexer->source[lexer->start + 1];
token.length = lexer->current - lexer->start - 2;
token.line = lexer->line;
#ifndef TOY_EXPORT
if (Toy_commandLine.verbose) {
printf("str:");
Toy_printToken(&token);
}
#endif
return token;
}
static Toy_Token makeKeywordOrIdentifier(Toy_Lexer* lexer) {
advance(lexer); //first letter can only be alpha
while(isDigit(lexer) || isAlpha(lexer)) {
advance(lexer);
}
//scan for a keyword
for (int i = 0; Toy_keywordTypes[i].keyword; i++) {
if (strlen(Toy_keywordTypes[i].keyword) == (long unsigned int)(lexer->current - lexer->start) && !strncmp(Toy_keywordTypes[i].keyword, &lexer->source[lexer->start], lexer->current - lexer->start)) {
Toy_Token token;
token.type = Toy_keywordTypes[i].type;
token.lexeme = &lexer->source[lexer->start];
token.length = lexer->current - lexer->start;
token.line = lexer->line;
#ifndef TOY_EXPORT
if (Toy_commandLine.verbose) {
printf("kwd:");
Toy_printToken(&token);
}
#endif
return token;
}
}
//return an identifier
Toy_Token token;
token.type = TOY_TOKEN_IDENTIFIER;
token.lexeme = &lexer->source[lexer->start];
token.length = lexer->current - lexer->start;
token.line = lexer->line;
#ifndef TOY_EXPORT
if (Toy_commandLine.verbose) {
printf("idf:");
Toy_printToken(&token);
}
#endif
return token;
}
//exposed functions
void Toy_initLexer(Toy_Lexer* lexer, char* source) {
cleanLexer(lexer);
lexer->source = source;
}
Toy_Token Toy_scanLexer(Toy_Lexer* lexer) {
eatWhitespace(lexer);
lexer->start = lexer->current;
if (isAtEnd(lexer)) return makeToken(lexer, TOY_TOKEN_EOF);
if (isDigit(lexer)) return makeIntegerOrFloat(lexer);
if (isAlpha(lexer)) return makeKeywordOrIdentifier(lexer);
char c = advance(lexer);
switch(c) {
case '(': return makeToken(lexer, TOY_TOKEN_PAREN_LEFT);
case ')': return makeToken(lexer, TOY_TOKEN_PAREN_RIGHT);
case '{': return makeToken(lexer, TOY_TOKEN_BRACE_LEFT);
case '}': return makeToken(lexer, TOY_TOKEN_BRACE_RIGHT);
case '[': return makeToken(lexer, TOY_TOKEN_BRACKET_LEFT);
case ']': return makeToken(lexer, TOY_TOKEN_BRACKET_RIGHT);
case '+': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_PLUS_ASSIGN : match(lexer, '+') ? TOY_TOKEN_PLUS_PLUS: TOY_TOKEN_PLUS);
case '-': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_MINUS_ASSIGN : match(lexer, '-') ? TOY_TOKEN_MINUS_MINUS: TOY_TOKEN_MINUS);
case '*': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_MULTIPLY_ASSIGN : TOY_TOKEN_MULTIPLY);
case '/': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_DIVIDE_ASSIGN : TOY_TOKEN_DIVIDE);
case '%': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_MODULO_ASSIGN : TOY_TOKEN_MODULO);
case '!': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_NOT_EQUAL : TOY_TOKEN_NOT);
case '=': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_EQUAL : TOY_TOKEN_ASSIGN);
case '<': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_LESS_EQUAL : TOY_TOKEN_LESS);
case '>': return makeToken(lexer, match(lexer, '=') ? TOY_TOKEN_GREATER_EQUAL : TOY_TOKEN_GREATER);
case '&': //TOKEN_AND not used
if (advance(lexer) != '&') {
return makeErrorToken(lexer, "Unexpected '&'");
} else {
return makeToken(lexer, TOY_TOKEN_AND);
}
case '|': return makeToken(lexer, match(lexer, '|') ? TOY_TOKEN_OR : TOY_TOKEN_PIPE);
case '?': return makeToken(lexer, TOY_TOKEN_QUESTION);
case ':': return makeToken(lexer, TOY_TOKEN_COLON);
case ';': return makeToken(lexer, TOY_TOKEN_SEMICOLON);
case ',': return makeToken(lexer, TOY_TOKEN_COMMA);
case '.':
if (peek(lexer) == '.' && peekNext(lexer) == '.') {
advance(lexer);
advance(lexer);
return makeToken(lexer, TOY_TOKEN_REST);
}
return makeToken(lexer, TOY_TOKEN_DOT);
case '"':
return makeString(lexer, c);
//TODO: possibly support interpolated strings
default: {
char buffer[128];
snprintf(buffer, 128, "Unexpected token: %c", c);
return makeErrorToken(lexer, buffer);
}
}
}
static void trim(char** s, int* l) { //all this to remove a newline?
while( isspace(( (*((unsigned char**)(s)))[(*l) - 1] )) ) (*l)--;
while(**s && isspace( **(unsigned char**)(s)) ) { (*s)++; (*l)--; }
}
//for debugging
void Toy_printToken(Toy_Token* token) {
if (token->type == TOY_TOKEN_ERROR) {
printf(TOY_CC_ERROR "Error\t%d\t%.*s\n" TOY_CC_RESET, token->line, token->length, token->lexeme);
return;
}
printf("\t%d\t%d\t", token->type, token->line);
if (token->type == TOY_TOKEN_IDENTIFIER || token->type == TOY_TOKEN_LITERAL_INTEGER || token->type == TOY_TOKEN_LITERAL_FLOAT || token->type == TOY_TOKEN_LITERAL_STRING) {
printf("%.*s\t", token->length, token->lexeme);
} else {
char* keyword = Toy_findKeywordByType(token->type);
if (keyword != NULL) {
printf("%s", keyword);
} else {
char* str = token->lexeme;
int length = token->length;
trim(&str, &length);
printf("%.*s", length, str);
}
}
printf("\n");
}