Wrote a basic lexer

This commit is contained in:
2022-08-03 09:35:20 +01:00
parent 3cbf7b13eb
commit 3cad70dddd
12 changed files with 884 additions and 0 deletions

34
makefile Normal file
View File

@@ -0,0 +1,34 @@
# Top-level build: create the shared output directory, then delegate to source/.
# OUTDIR is exported so the sub-make can place the binary under it.
export OUTDIR = out

# all/clean/rebuild are commands, not files - declare them phony so a stray
# file with the same name can't silently disable them
.PHONY: all clean rebuild

all: $(OUTDIR)
	$(MAKE) -C source

$(OUTDIR):
	mkdir -p $(OUTDIR)

# Artefact sweep shared by the unix-like platforms below; the old per-platform
# copies were byte-identical duplicates. (-type f makes rm's -r redundant.)
FIND_CLEAN = find . -type f \( -name '*.o' -o -name '*.a' -o -name '*.exe' -o -name '*.dll' -o -name '*.lib' -o -name '*.so' \) -exec rm -f -v {} \; && find . -empty -type d -delete

clean:
ifeq ($(findstring CYGWIN, $(shell uname)),CYGWIN)
	$(FIND_CLEAN)
else ifeq ($(shell uname), Linux)
	$(FIND_CLEAN)
else ifeq ($(OS),Windows_NT)
	$(RM) *.o *.a *.exe
else
	@echo "Deletion failed - what platform is this?"
endif

# NOTE: under `make -j`, clean and all may overlap; invoke sequentially when rebuilding
rebuild: clean all

1
scripts/test.toy Normal file
View File

@@ -0,0 +1 @@
print "Hello world";

11
source/common.h Normal file
View File

@@ -0,0 +1,11 @@
#pragma once

//shared includes and version information for the whole interpreter
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

//semantic version of this Toy implementation
#define TOY_VERSION_MAJOR 0
#define TOY_VERSION_MINOR 6
#define TOY_VERSION_PATCH 0

//build stamp: __DATE__ is the compile date of the including translation unit
#define TOY_VERSION_BUILD __DATE__

103
source/debug.c Normal file
View File

@@ -0,0 +1,103 @@
#include "debug.h"
#include "keyword_types.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//print a single token to stdout, for debugging the lexer's output
//error tokens print their message, literal/identifier tokens print their
//lexeme, everything else is looked up in the keyword table
void printToken(Token* token) {
	if (token->type == TOKEN_ERROR) {
		//lexeme is not null-terminated, so print exactly `length` characters
		printf("Error\t%d\t%.*s\n", token->line, token->length, token->lexeme);
		return;
	}

	printf("\t%d\t%d\t", token->type, token->line);

	if (token->type == TOKEN_IDENTIFIER || token->type == TOKEN_LITERAL_INTEGER || token->type == TOKEN_LITERAL_FLOAT || token->type == TOKEN_LITERAL_STRING) {
		printf("%.*s\t", token->length, token->lexeme);
	} else {
		char* keyword = findKeywordByType(token->type);

		if (keyword != NULL) {
			printf("%s", keyword);
		} else {
			//no printable representation for this token type
			printf("-");
		}
	}

	printf("\n");
}
//the command-line singleton, populated once at startup
Command command;

//parse argv into the command singleton; an unrecognised argument (or no
//arguments at all) sets the error flag
void initCommand(int argc, const char* argv[]) {
	//begin from a clean slate
	command.error = false;
	command.help = false;
	command.version = false;
	command.filename = NULL;
	command.source = NULL;
	command.verbose = false;

	//argv[0] is the program name, so begin at index 1
	int i = 1;
	while (i < argc) {
		const char* arg = argv[i];

		if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0) {
			command.help = true;
		}
		else if (strcmp(arg, "-v") == 0 || strcmp(arg, "--version") == 0) {
			command.version = true;
		}
		else if ((strcmp(arg, "-f") == 0 || strcmp(arg, "--file") == 0) && i + 1 < argc) {
			//the flag's value is the following argument
			command.filename = (char*)argv[++i];
		}
		else if ((strcmp(arg, "-i") == 0 || strcmp(arg, "--input") == 0) && i + 1 < argc) {
			command.source = (char*)argv[++i];
		}
		else if (strcmp(arg, "-d") == 0 || strcmp(arg, "--debug") == 0) {
			command.verbose = true;
		}
		else {
			command.error = true;
		}

		i++;
	}

	//calling with no arguments is also an error
	if (argc == 1) {
		command.error = true;
	}
}
//print a one-line usage summary; only argv[0] (the program name) is used
void usageCommand(int argc, const char* argv[]) {
	(void)argc; //present for signature symmetry with the other *Command functions
	printf("Usage: %s [-h | -v | [-d][-f filename | -i source]]\n\n", argv[0]);
}
//print the usage line followed by a description of each flag
void helpCommand(int argc, const char* argv[]) {
	usageCommand(argc, argv);
	printf("-h | --help\t\tShow this help then exit.\n");
	printf("-v | --version\t\tShow version and copyright information then exit.\n");
	printf("-f | --file filename\tParse and execute the source file.\n");
	printf("-i | --input source\tParse and execute this given string of source code.\n");
	printf("-d | --debug\t\tBe verbose when operating.\n");
}
//print the interpreter version (from common.h) and the zlib-style license text
void copyrightCommand(int argc, const char* argv[]) {
	printf("Toy Programming Language Interpreter Version %d.%d.%d (built on %s)\n\n", TOY_VERSION_MAJOR, TOY_VERSION_MINOR, TOY_VERSION_PATCH, TOY_VERSION_BUILD);
	printf("Copyright (c) 2020-2022 Kayne Ruse, KR Game Studios\n\n");
	printf("This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.\n\n");
	printf("Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:\n\n");
	printf("1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n\n");
	printf("2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n\n");
	printf("3. This notice may not be removed or altered from any source distribution.\n\n");
}

24
source/debug.h Normal file
View File

@@ -0,0 +1,24 @@
#pragma once

#include "common.h"
#include "lexer.h"

//print a human-readable dump of a single token to stdout
void printToken(Token* token);

//for processing the command line arguments
typedef struct {
	bool error; //set when the arguments could not be parsed
	bool help; //-h | --help
	bool version; //-v | --version
	char* filename; //-f | --file; points into argv, not owned
	char* source; //-i | --input; points into argv, not owned
	bool verbose; //-d | --debug
} Command;

//the lone global instance, defined in debug.c
extern Command command;

void initCommand(int argc, const char* argv[]);
void usageCommand(int argc, const char* argv[]);
void helpCommand(int argc, const char* argv[]);
void copyrightCommand(int argc, const char* argv[]);

62
source/keyword_types.c Normal file
View File

@@ -0,0 +1,62 @@
#include "keyword_types.h"
#include "common.h"
//maps each reserved word's spelling to its token type; the table is
//terminated by the NULL-keyword sentinel entry (TOKEN_EOF), which loops
//over this array rely on
KeywordType keywordTypes[] = {
	//type keywords
	{TOKEN_NULL, "null"},
	{TOKEN_BOOLEAN, "bool"},
	{TOKEN_INTEGER, "int"},
	{TOKEN_FLOAT, "float"},
	{TOKEN_STRING, "string"},
	{TOKEN_ARRAY, "array"},
	{TOKEN_DICTIONARY, "dictionary"},
	{TOKEN_FUNCTION, "function"},
	{TOKEN_ANY, "any"},

	//other keywords
	{TOKEN_AS, "as"},
	{TOKEN_ASSERT, "assert"},
	{TOKEN_BREAK, "break"},
	{TOKEN_CLASS, "class"},
	{TOKEN_CONST, "const"},
	{TOKEN_CONTINUE, "continue"},
	{TOKEN_DO, "do"},
	{TOKEN_ELSE, "else"},
	{TOKEN_EXPORT, "export"},
	{TOKEN_FOR, "for"},
	{TOKEN_FOREACH, "foreach"},
	{TOKEN_IF, "if"},
	{TOKEN_IMPORT, "import"},
	{TOKEN_IN, "in"},
	{TOKEN_OF, "of"},
	{TOKEN_PRINT, "print"},
	{TOKEN_RETURN, "return"},
	{TOKEN_USING, "using"},
	{TOKEN_VAR, "var"},
	{TOKEN_WHILE, "while"},

	//literal values
	{TOKEN_LITERAL_TRUE, "true"},
	{TOKEN_LITERAL_FALSE, "false"},

	//meta tokens
	{TOKEN_PASS, "pass"},
	{TOKEN_ERROR, "error"},
	{TOKEN_EOF, NULL}, //sentinel: marks the end of the table
};
//reverse lookup: map a token type back to its keyword spelling, for debug
//output; returns NULL when the type has no keyword representation
char* findKeywordByType(TokenType type) {
	//TOKEN_EOF doubles as the table's NULL-keyword sentinel, so handle it here
	if (type == TOKEN_EOF) {
		return "EOF";
	}

	int i = 0;
	while (keywordTypes[i].keyword != NULL) {
		if (keywordTypes[i].type == type) {
			return keywordTypes[i].keyword;
		}
		i++;
	}

	return NULL;
}

13
source/keyword_types.h Normal file
View File

@@ -0,0 +1,13 @@
#pragma once

#include "token_types.h"

//one row of the keyword table: a token type and its source-code spelling
typedef struct {
	TokenType type;
	char* keyword; //NULL marks the end of the table
} KeywordType;

//defined in keyword_types.c; terminated by a NULL-keyword entry
extern KeywordType keywordTypes[];

//for debugging
char* findKeywordByType(TokenType type);

297
source/lexer.c Normal file
View File

@@ -0,0 +1,297 @@
#include "lexer.h"
#include "keyword_types.h"
#include "debug.h"
#include <stdio.h>
#include <string.h>
//static generic utility functions
//reset a lexer to a known-empty state (line counting starts at 1)
static void cleanLexer(Lexer* lexer) {
	lexer->source = NULL;
	lexer->start = 0;
	lexer->current = 0;
	lexer->line = 1;
}
//true when the cursor sits on the source's null terminator
static bool isAtEnd(Lexer* lexer) {
	return lexer->source[lexer->current] == '\0';
}
//look at the current character without consuming it
static char peek(Lexer* lexer) {
	return lexer->source[lexer->current];
}
//look one character ahead; returns '\0' rather than reading past the end
static char peekNext(Lexer* lexer) {
	if (isAtEnd(lexer)) return '\0';
	return lexer->source[lexer->current + 1];
}
//consume and return the current character ('\0' once past the end)
static char advance(Lexer* lexer) {
	if (isAtEnd(lexer)) {
		return '\0';
	}

	//count lines here so every consumer gets correct error positions
	if (lexer->source[lexer->current] == '\n') {
		lexer->line++;
	}

	lexer->current++;
	return lexer->source[lexer->current - 1];
}
//skip spaces, tabs, newlines and both comment styles before the next token;
//recurses (tail call) until a significant character is reached
static void eatWhitespace(Lexer* lexer) {
	const char c = peek(lexer);

	switch(c) {
		case ' ':
		case '\r':
		case '\n': //advance() bumps the line counter for us
		case '\t':
			advance(lexer);
			break;

		//comments
		case '/':
			//eat the rest of the line
			if (peekNext(lexer) == '/') {
				while (advance(lexer) != '\n' && !isAtEnd(lexer));
				break;
			}

			//eat the block comment
			if (peekNext(lexer) == '*') {
				advance(lexer);
				advance(lexer);

				//BUGFIX: stop at end-of-source, so an unterminated block
				//comment can't scan past the buffer
				while (!isAtEnd(lexer) && !(peek(lexer) == '*' && peekNext(lexer) == '/')) {
					advance(lexer);
				}

				//eat the closing */ (advance() is a no-op at end-of-source)
				advance(lexer);
				advance(lexer);
				break;
			}

			//a lone '/' is a real token - fall through and return
			/* fallthrough */
		default:
			return;
	}

	//tail recursion
	eatWhitespace(lexer);
}
//is the current character a decimal digit?
static bool isDigit(Lexer* lexer) {
	return peek(lexer) >= '0' && peek(lexer) <= '9';
}
//is the current character legal in an identifier (letter or underscore)?
static bool isAlpha(Lexer* lexer) {
	return
		(peek(lexer) >= 'A' && peek(lexer) <= 'Z') ||
		(peek(lexer) >= 'a' && peek(lexer) <= 'z') ||
		peek(lexer) == '_'
	;
}
//consume the current character only if it equals c; used for two-character
//operators such as += and ==
static bool match(Lexer* lexer, char c) {
	if (peek(lexer) == c) {
		advance(lexer);
		return true;
	}
	return false;
}
//token generators
//build an error token whose lexeme is a static message string
//(msg is not copied, so it must outlive the token)
static Token makeErrorToken(Lexer* lexer, char* msg) {
	Token token;

	token.type = TOKEN_ERROR;
	token.lexeme = msg;
	token.length = strlen(msg);
	token.line = lexer->line;

	if (command.verbose) {
		printf("err:");
		printToken(&token);
	}

	return token;
}
//build a token spanning everything consumed since lexer->start
//BUGFIX: the original hard-coded lexeme = &source[current - 1], length = 1,
//which truncated two-character operators (+=, ==, ++, ...) and read
//source[-1] for an EOF token on empty input
static Token makeToken(Lexer* lexer, TokenType type) {
	Token token;

	token.type = type;
	token.lexeme = &lexer->source[lexer->start];
	token.length = lexer->current - lexer->start; //0 for TOKEN_EOF
	token.line = lexer->line;

	if (command.verbose) {
		printf("tok:");
		printToken(&token);
	}

	return token;
}
//consume a numeric literal beginning at lexer->start; an embedded '.'
//upgrades the token from an integer literal to a float literal
static Token makeIntegerOrFloat(Lexer* lexer) {
	TokenType type = TOKEN_LITERAL_INTEGER; //what am I making?

	while(isDigit(lexer)) advance(lexer);

	//NOTE(review): a '.' with no digit after it (e.g. "1.") still becomes a
	//float literal - confirm this is the intended grammar
	if (peek(lexer) == '.') {
		type = TOKEN_LITERAL_FLOAT;
		advance(lexer);
		while(isDigit(lexer)) advance(lexer);
	}

	Token token;

	token.type = type;
	token.lexeme = &lexer->source[lexer->start];
	token.length = lexer->current - lexer->start;
	token.line = lexer->line;

	if (command.verbose) {
		if (type == TOKEN_LITERAL_INTEGER) {
			printf("int:");
		} else {
			printf("flt:");
		}
		printToken(&token);
	}

	return token;
}
//consume a string literal; `terminator` is the opening quote character
//the produced lexeme excludes the surrounding quotes
static Token makeString(Lexer* lexer, char terminator) {
	while (!isAtEnd(lexer) && peek(lexer) != terminator) {
		advance(lexer);
	}

	//BUGFIX: check for end-of-source BEFORE eating the terminator - the
	//original consumed it first, so a string ending exactly at the end of
	//the source was wrongly reported as unterminated
	if (isAtEnd(lexer)) {
		return makeErrorToken(lexer, "Unterminated string");
	}

	advance(lexer); //eat terminator

	Token token;

	token.type = TOKEN_LITERAL_STRING;
	token.lexeme = &lexer->source[lexer->start + 1]; //skip the opening quote
	token.length = lexer->current - lexer->start - 2; //drop both quotes
	token.line = lexer->line;

	if (command.verbose) {
		printf("str:");
		printToken(&token);
	}

	return token;
}
//consume a word; return the matching reserved-word token if the word is in
//the keyword table, otherwise an identifier token
static Token makeKeywordOrIdentifier(Lexer* lexer) {
	advance(lexer); //first letter can only be alpha

	while(isDigit(lexer) || isAlpha(lexer)) {
		advance(lexer);
	}

	//scan for a keyword (the table ends at its NULL-keyword sentinel)
	for (int i = 0; keywordTypes[i].keyword; i++) {
		//compare lengths first so e.g. "if" doesn't match the prefix of "iffy"
		if (strlen(keywordTypes[i].keyword) == (long unsigned int)(lexer->current - lexer->start) && !strncmp(keywordTypes[i].keyword, &lexer->source[lexer->start], lexer->current - lexer->start)) {
			Token token;

			token.type = keywordTypes[i].type;
			token.lexeme = &lexer->source[lexer->start];
			token.length = lexer->current - lexer->start;
			token.line = lexer->line;

			if (command.verbose) {
				printf("kwd:");
				printToken(&token);
			}

			return token;
		}
	}

	//return an identifier
	Token token;

	token.type = TOKEN_IDENTIFIER;
	token.lexeme = &lexer->source[lexer->start];
	token.length = lexer->current - lexer->start;
	token.line = lexer->line;

	if (command.verbose) {
		printf("idf:");
		printToken(&token);
	}

	return token;
}
//exposed functions
//bind the lexer to a new null-terminated source string, resetting all state
//(the lexer does not take ownership of the string)
void initLexer(Lexer* lexer, char* source) {
	cleanLexer(lexer);
	lexer->source = source;
}
//scan and return the next token from the bound source string;
//returns TOKEN_EOF once the input is exhausted
Token scanLexer(Lexer* lexer) {
	eatWhitespace(lexer);

	//every token starts where the whitespace ended
	lexer->start = lexer->current;

	if (isAtEnd(lexer)) return makeToken(lexer, TOKEN_EOF);

	if (isDigit(lexer)) return makeIntegerOrFloat(lexer);
	if (isAlpha(lexer)) return makeKeywordOrIdentifier(lexer);

	char c = advance(lexer);

	switch(c) {
		case '(': return makeToken(lexer, TOKEN_PAREN_LEFT);
		case ')': return makeToken(lexer, TOKEN_PAREN_RIGHT);
		case '{': return makeToken(lexer, TOKEN_BRACE_LEFT);
		case '}': return makeToken(lexer, TOKEN_BRACE_RIGHT);
		//"[]" collapses into the array type keyword token
		case '[': return makeToken(lexer, match(lexer, ']') ? TOKEN_ARRAY : TOKEN_BRACKET_LEFT);
		case ']': return makeToken(lexer, TOKEN_BRACKET_RIGHT);

		//two-character operators are resolved by match(), which only
		//consumes the second character on success
		case '+': return makeToken(lexer, match(lexer, '=') ? TOKEN_PLUS_ASSIGN : match(lexer, '+') ? TOKEN_PLUS_PLUS: TOKEN_PLUS);
		case '-': return makeToken(lexer, match(lexer, '=') ? TOKEN_MINUS_ASSIGN : match(lexer, '-') ? TOKEN_MINUS_MINUS: TOKEN_MINUS);
		case '*': return makeToken(lexer, match(lexer, '=') ? TOKEN_MULTIPLY_ASSIGN : TOKEN_MULTIPLY);
		case '/': return makeToken(lexer, match(lexer, '=') ? TOKEN_DIVIDE_ASSIGN : TOKEN_DIVIDE);
		case '%': return makeToken(lexer, match(lexer, '=') ? TOKEN_MODULO_ASSIGN : TOKEN_MODULO);
		case '!': return makeToken(lexer, match(lexer, '=') ? TOKEN_NOT_EQUAL : TOKEN_NOT);
		case '=': return makeToken(lexer, match(lexer, '=') ? TOKEN_EQUAL : TOKEN_ASSIGN);
		case '<': return makeToken(lexer, match(lexer, '=') ? TOKEN_LESS_EQUAL : TOKEN_LESS);
		case '>': return makeToken(lexer, match(lexer, '=') ? TOKEN_GREATER_EQUAL : TOKEN_GREATER);

		case '&': //TOKEN_AND not used
			//NOTE(review): advance() consumes the next character even on the
			//error path, so the character after a lone '&' is lost - confirm
			if (advance(lexer) != '&') {
				return makeErrorToken(lexer, "Unexpected '&'");
			} else {
				return makeToken(lexer, TOKEN_AND);
			}

		case '|': return makeToken(lexer, match(lexer, '|') ? TOKEN_OR : TOKEN_PIPE);
		case ':': return makeToken(lexer, TOKEN_COLON);
		case ';': return makeToken(lexer, TOKEN_SEMICOLON);
		case ',': return makeToken(lexer, TOKEN_COMMA);

		case '.':
			//NOTE(review): this looks for ".," after the first '.' and never
			//consumes the lookahead - presumably TOKEN_REST is meant to be
			//"..." (peekNext == '.'); confirm against the parser's needs
			if (peek(lexer) == '.' && peekNext(lexer) == ',') {
				return makeToken(lexer, TOKEN_REST);
			}
			return makeToken(lexer, TOKEN_DOT);

		case '"':
			return makeString(lexer, c);

		//TODO: possibly support interpolated strings

		default:
			return makeErrorToken(lexer, "Unexpected token");
	}
}

24
source/lexer.h Normal file
View File

@@ -0,0 +1,24 @@
#pragma once

#include "common.h"
#include "token_types.h"

//lexers are bound to a string of code, and return a single token every time scan is called
typedef struct {
	char* source; //the code being scanned (not owned by the lexer)
	int start; //start of the token
	int current; //current position of the lexer
	int line; //track this for error handling
} Lexer;

//tokens are intermediaries between lexers and parsers
typedef struct {
	TokenType type;
	char* lexeme; //points into the lexer's source (or a static error message)
	int length; //lexeme is NOT null-terminated; always pair it with length
	int line;
} Token;

//bind the lexer to a null-terminated source string
void initLexer(Lexer* lexer, char* source);

//produce the next token; returns TOKEN_EOF at the end of the input
Token scanLexer(Lexer* lexer);

27
source/makefile Normal file
View File

@@ -0,0 +1,27 @@
# Builds every .c file in this directory into obj/ and links the interpreter
# into ../$(OUTDIR)/toy (OUTDIR is exported by the parent makefile).
CC=gcc
IDIR =.
CFLAGS=$(addprefix -I,$(IDIR)) -g -Wall -W -pedantic
LIBS=
ODIR=obj
SRC = $(wildcard *.c)
OBJ = $(addprefix $(ODIR)/,$(SRC:.c=.o))
OUT = ../$(OUTDIR)/toy

.PHONY: all clean

all: $(OBJ)
	$(CC) -o $(OUT) $^ $(CFLAGS) $(LIBS)

# order-only prerequisite: the obj dir must exist, but its timestamp
# must not trigger rebuilds
$(OBJ): | $(ODIR)

$(ODIR):
	mkdir -p $(ODIR)

$(ODIR)/%.o: %.c
	$(CC) -c -o $@ $< $(CFLAGS)

# BUGFIX: $(RM) is `rm -f`, which cannot remove a directory - the old
# `$(RM) $(ODIR)` silently left obj/ behind; also remove the linked binary
clean:
	$(RM) -r $(ODIR)
	$(RM) $(OUT)

197
source/repl_main.c Normal file
View File

@@ -0,0 +1,197 @@
#include "debug.h"
#include "lexer.h"
//-#include "parser.h"
//#include "toy.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//read a file and return it as a char array
char* readFile(char* path) {
FILE* file = fopen(path, "rb");
if (file == NULL) {
fprintf(stderr, "Could not open file \"%s\"\n", path);
exit(74);
}
fseek(file, 0L, SEEK_END);
size_t fileSize = ftell(file);
rewind(file);
char* buffer = (char*)malloc(fileSize + 1);
if (buffer == NULL) {
fprintf(stderr, "Not enough memory to read \"%s\"\n", path);
exit(74);
}
size_t bytesRead = fread(buffer, sizeof(char), fileSize, file);
if (bytesRead < fileSize) {
fprintf(stderr, "Could not read file \"%s\"\n", path);
exit(74);
}
fclose(file);
buffer[bytesRead] = '\0';
return buffer;
}
/*
//run functions
void runString(char* source) {
Lexer lexer;
Parser parser;
Toy toy;
initLexer(&lexer, source);
initParser(&parser, &lexer);
initToy(&toy);
Chunk* chunk = scanParser(&parser);
if (chunk->count > 1 && command.verbose) {
printChunk(chunk, " ");
}
executeChunk(&toy, chunk);
freeChunk(chunk);
freeToy(&toy);
freeParser(&parser);
}
void runFile(char* fname) {
char* source = readFile(fname);
runString(source);
free((void*)source);
}
void repl() {
const int size = 2048;
char input[size];
memset(input, 0, size);
Parser parser;
Toy toy;
initToy(&toy);
for(;;) {
printf(">");
fgets(input, size, stdin);
//setup
Lexer lexer;
initLexer(&lexer, input);
initParser(&parser, &lexer);
//run
Chunk* chunk = scanParser(&parser);
if (chunk->count > 1 && command.verbose) {
printChunk(chunk, " ");
}
//clean up the memory
if (parser.error) {
freeChunk(chunk);
freeParser(&parser);
continue;
}
executeChunk(&toy, chunk);
if (toy.panic) {
toy.panic = false;
freeChunk(chunk);
freeParser(&parser);
continue;
}
freeChunk(chunk);
//cleanup
freeParser(&parser);
}
freeToy(&toy);
}
*/
//lex the file named on the command line, token by token, until EOF; token
//printing happens inside scanLexer when the verbose flag is set
//(a stop-gap entry point until the parser exists)
void debug() {
	Lexer lexer;
	Token token;

	char* source = readFile(command.filename);
	initLexer(&lexer, source);

	//run the lexer until the end of the source
	do {
		token = scanLexer(&lexer);
	} while(token.type != TOKEN_EOF);

	//BUGFIX: readFile hands over ownership of the buffer; the original leaked it
	free(source);
}
//entry point
int main(int argc, const char* argv[]) {
initCommand(argc, argv);
//command specific actions
if (command.error) {
usageCommand(argc, argv);
return 0;
}
if (command.help) {
helpCommand(argc, argv);
return 0;
}
if (command.version) {
copyrightCommand(argc, argv);
return 0;
}
//print this until the interpreter meets the specification
if (command.verbose) {
printf("Warning! This interpreter is a work in progress, it does not yet meet the %d.%d.%d specification.\n", TOY_VERSION_MAJOR, TOY_VERSION_MINOR, TOY_VERSION_PATCH);
}
if (command.filename) {
debug();
// runFile(command.filename);
return 0;
}
if (command.source) {
// runString(command.source);
// Lexer lexer;
// initLexer(&lexer, command.source);
// //debugging
// while(true) {
// Token token = scanLexer(&lexer);
// if (token.type == TOKEN_EOF) {
// break;
// }
// }
return 0;
}
// repl();
return 0;
}

91
source/token_types.h Normal file
View File

@@ -0,0 +1,91 @@
#pragma once
typedef enum TokenType {
//types
TOKEN_NULL,
TOKEN_BOOLEAN,
TOKEN_INTEGER,
TOKEN_FLOAT,
TOKEN_STRING,
TOKEN_ARRAY,
TOKEN_DICTIONARY,
TOKEN_FUNCTION,
TOKEN_ANY,
//keywords and reserved words
TOKEN_AS,
TOKEN_ASSERT,
TOKEN_BREAK,
TOKEN_CLASS,
TOKEN_CONST,
TOKEN_CONTINUE,
TOKEN_DO,
TOKEN_ELSE,
TOKEN_EXPORT,
TOKEN_FOR,
TOKEN_FOREACH,
TOKEN_IF,
TOKEN_IMPORT,
TOKEN_IN,
TOKEN_OF,
TOKEN_PRINT,
TOKEN_RETURN,
TOKEN_USING,
TOKEN_VAR,
TOKEN_WHILE,
//literal values
TOKEN_IDENTIFIER,
TOKEN_LITERAL_NULL,
TOKEN_LITERAL_TRUE,
TOKEN_LITERAL_FALSE,
TOKEN_LITERAL_INTEGER,
TOKEN_LITERAL_FLOAT,
TOKEN_LITERAL_STRING,
//math operators
TOKEN_PLUS,
TOKEN_MINUS,
TOKEN_MULTIPLY,
TOKEN_DIVIDE,
TOKEN_MODULO,
TOKEN_PLUS_ASSIGN,
TOKEN_MINUS_ASSIGN,
TOKEN_MULTIPLY_ASSIGN,
TOKEN_DIVIDE_ASSIGN,
TOKEN_MODULO_ASSIGN,
TOKEN_PLUS_PLUS,
TOKEN_MINUS_MINUS,
//logical operators
TOKEN_PAREN_LEFT,
TOKEN_PAREN_RIGHT,
TOKEN_BRACKET_LEFT,
TOKEN_BRACKET_RIGHT,
TOKEN_BRACE_LEFT,
TOKEN_BRACE_RIGHT,
TOKEN_NOT,
TOKEN_NOT_EQUAL,
TOKEN_EQUAL,
TOKEN_LESS,
TOKEN_GREATER,
TOKEN_LESS_EQUAL,
TOKEN_GREATER_EQUAL,
TOKEN_AND,
TOKEN_OR,
//other operators
TOKEN_ASSIGN,
TOKEN_COLON,
TOKEN_SEMICOLON,
TOKEN_COMMA,
TOKEN_DOT,
TOKEN_PIPE,
TOKEN_REST,
//meta tokens
TOKEN_PASS,
TOKEN_ERROR,
TOKEN_EOF,
} TokenType;