From 7c054db9e6a42fc8091044a5fbac87730067128d Mon Sep 17 00:00:00 2001
From: Kayne Ruse
Date: Sat, 8 Feb 2025 17:27:47 +1100
Subject: [PATCH] Compiler now reuses existing strings in the data, read more

If a string exists in the data, instead of being written, the function
'emitCStringToData()' will instead return the address of the match
within the data section. Then, I can search the jump table for that
address, and use the existing jump entry or append a new one.

Fixes #168
---
 source/toy_module_bundle.c         |  2 +-
 source/toy_module_compiler.c       | 76 ++++++++++++++++++------
 tests/cases/test_module_compiler.c | 94 ++++++++++++++++++++++++++++++
 3 files changed, 152 insertions(+), 20 deletions(-)

diff --git a/source/toy_module_bundle.c b/source/toy_module_bundle.c
index 3ecf36f..15febc0 100644
--- a/source/toy_module_bundle.c
+++ b/source/toy_module_bundle.c
@@ -126,7 +126,7 @@ void Toy_bindModuleBundle(Toy_ModuleBundle* bundle, unsigned char* ptr, unsigned
 	memcpy(bundle->ptr, ptr, size);
 	bundle->count = size;
 
-	//URGENT: test this
+	//TODO: test this
 	int valid = validateModuleBundleHeader(bundle);
 
 	if (valid < 0) {
diff --git a/source/toy_module_compiler.c b/source/toy_module_compiler.c
index a9410f0..cea7dd9 100644
--- a/source/toy_module_compiler.c
+++ b/source/toy_module_compiler.c
@@ -108,44 +108,82 @@ static void emitFloat(unsigned char** handle, unsigned int* capacity, unsigned i
 //simply get the address (always an integer)
 #define CURRENT_ADDRESS(mb, part) ((*mb)->part##Count)
 
-static void emitToJumpTable(Toy_ModuleCompiler** mb, unsigned int startAddr) {
-	EMIT_INT(mb, code, (*mb)->jumpsCount); //mark the jump index in the code
-	EMIT_INT(mb, jumps, startAddr); //save address at the jump index
+//Cached write to data, enabling string reuse, see #168
+//Returns the byte offset of 'cstr' within the data section, appending it (padded to 4 bytes) only if no identical entry exists
+//NOTE: assumes the data section only ever holds 4-byte aligned, null-terminated c-strings
+static unsigned int emitCStringToData(unsigned char** dataHandle, unsigned int* capacity, unsigned int* count, const char* cstr) {
+	const unsigned int slen = (unsigned int)strlen(cstr) + 1; //+1 for null
+	const unsigned int padded = (slen + 3) & ~3u; //4-byte aligned
+
+	//see if the string already exists in the data
+	unsigned int pos = 0;
+	while (pos < *count) {
+		const char* entry = ((char*)(*dataHandle)) + pos;
+		const unsigned int elen = (unsigned int)strlen(entry) + 1; //+1 for null
+
+		//compare, the lengths already match so a raw memory comparison is enough
+		if (slen == elen && memcmp(cstr, entry, slen) == 0) {
+			return pos;
+		}
+
+		//next, every entry is padded out to a 4-byte boundary
+		pos += (elen + 3) & ~3u;
+	}
+
+	//default, append the new entry
+	const unsigned int addr = *count; //save the target address
+	expand(dataHandle, capacity, count, padded);
+	memcpy((*dataHandle) + addr, cstr, slen);
+	memset((*dataHandle) + addr + slen, 0, padded - slen); //zero the padding, so the emitted bytes are deterministic
+	*count += padded;
+
+	return addr; //return the address of the string in the data section
 }
 
+//Writes 'str' into the data section (reusing an identical entry if present), then emits the matching jump table offset into the code
+//Always returns 1
 static unsigned int emitString(Toy_ModuleCompiler** mb, Toy_String* str) {
-	//4-byte alignment
-	unsigned int length = str->info.length + 1;
-	if (length % 4 != 0) {
-		length += 4 - (length % 4); //ceil
-	}
-
-	//grab the current start address
-	unsigned int startAddr = (*mb)->dataCount;
+	//the address within the data section
+	unsigned int dataAddr = 0;
 
 	//move the string into the data section
-	expand((&((*mb)->data)), &((*mb)->dataCapacity), &((*mb)->dataCount), length);
-
 	if (str->info.type == TOY_STRING_NODE) {
 		char* buffer = Toy_getStringRawBuffer(str);
-		memcpy((*mb)->data + (*mb)->dataCount, buffer, str->info.length + 1);
+
+		dataAddr = emitCStringToData(&(*mb)->data, &(*mb)->dataCapacity, &(*mb)->dataCount, buffer);
+
 		free(buffer);
 	}
 	else if (str->info.type == TOY_STRING_LEAF) {
-		memcpy((*mb)->data + (*mb)->dataCount, str->leaf.data, str->info.length + 1);
+		dataAddr = emitCStringToData(&(*mb)->data, &(*mb)->dataCapacity, &(*mb)->dataCount, str->leaf.data);
 	}
 	else if (str->info.type == TOY_STRING_NAME) {
-		memcpy((*mb)->data + (*mb)->dataCount, str->name.data, str->info.length + 1);
+		dataAddr = emitCStringToData(&(*mb)->data, &(*mb)->dataCapacity, &(*mb)->dataCount, str->name.data);
 	}
 
-	(*mb)->dataCount += length;
+	//mark the position within the jump table, reusing an existing entry if it exists
+	//NOTE: jumpsCount is used as a byte offset below, and entries are written with EMIT_INT, so step and compare one whole entry at a time (assumes EMIT_INT writes sizeof(unsigned int) bytes)
+	for (unsigned int i = 0; i + sizeof(unsigned int) <= (*mb)->jumpsCount; i += sizeof(unsigned int)) {
+		unsigned int entry = 0;
+		memcpy(&entry, ((unsigned char*)(*mb)->jumps) + i, sizeof(entry));
+
+		if (entry == dataAddr) {
+			//reuse, and finish
+			EMIT_INT(mb, code, i);
+			return 1;
+		}
+	}
 
-	//mark the jump position
-	emitToJumpTable(mb, startAddr);
+	EMIT_INT(mb, code, (*mb)->jumpsCount); //mark the new jump index in the code
+	EMIT_INT(mb, jumps, dataAddr); //append to the jump table
 
 	return 1;
 }
 
+// static unsigned int emitParameter(Toy_ModuleCompiler** mb, Toy_String* str) {
+//
+// }
+
 static unsigned int writeModuleCompilerCode(Toy_ModuleCompiler** mb, Toy_Ast* ast); //forward declare for recursion
 static unsigned int writeInstructionAssign(Toy_ModuleCompiler** mb, Toy_AstVarAssign ast, bool chainedAssignment); //forward declare for chaining of var declarations
 
diff --git a/tests/cases/test_module_compiler.c b/tests/cases/test_module_compiler.c
index 6fe0117..e013873 100644
--- a/tests/cases/test_module_compiler.c
+++ b/tests/cases/test_module_compiler.c
@@ -1309,6 +1309,90 @@ int test_compiler_keywords(Toy_Bucket** bucketHandle) {
 	return 0;
 }
 
+int test_compiler_string_reuse(Toy_Bucket** bucketHandle) {
+	//test string literals
+	{
+		//setup
+		const char* source = "var first: string = \"Hello world\"; var second: string = \"Hello world\";";
+		Toy_Lexer lexer;
+		Toy_Parser parser;
+
+		Toy_bindLexer(&lexer, source);
+		Toy_bindParser(&parser, &lexer);
+		Toy_Ast* ast = Toy_scanParser(bucketHandle, &parser);
+
+		//run
+		unsigned char* buffer = Toy_compileModule(ast);
+
+		//check header
+		int* ptr = (int*)buffer;
+
+		if ((ptr++)[0] != 108 || //total size
+			(ptr++)[0] != 12 || //jump count
+			(ptr++)[0] != 0 || //param count
+			(ptr++)[0] != 28 || //data count
+			(ptr++)[0] != 0 || //subs count
+			(ptr++)[0] != 32 || //code addr
+			(ptr++)[0] != 68 || //jumps addr
+			(ptr++)[0] != 80 || //data addr
+			false) //terminator
+		{
+			fprintf(stderr, TOY_CC_ERROR "ERROR: failed to reuse strings in module header, source: %s\n" TOY_CC_RESET, source);
+
+			//cleanup and return
+			free(buffer);
+			return -1;
+		}
+
+		//check code
+		if (*((unsigned char*)(buffer + 32)) != TOY_OPCODE_READ ||
+			*((unsigned char*)(buffer + 33)) != TOY_VALUE_STRING ||
+			*((unsigned char*)(buffer + 34)) != TOY_STRING_LEAF ||
+			*((unsigned char*)(buffer + 35)) != 0 ||
+
+			*((unsigned int*)(buffer + 36)) != 0 ||
+
+			*((unsigned char*)(buffer + 40)) != TOY_OPCODE_DECLARE ||
+			*((unsigned char*)(buffer + 41)) != TOY_VALUE_STRING ||
+			*((unsigned char*)(buffer + 42)) != 5 ||
+			*((unsigned char*)(buffer + 43)) != 0 ||
+
+			*((unsigned int*)(buffer + 44)) != 4 ||
+
+			*((unsigned char*)(buffer + 48)) != TOY_OPCODE_READ ||
+			*((unsigned char*)(buffer + 49)) != TOY_VALUE_STRING ||
+			*((unsigned char*)(buffer + 50)) != TOY_STRING_LEAF ||
+			*((unsigned char*)(buffer + 51)) != 0 ||
+
+			*((unsigned int*)(buffer + 52)) != 0 || //duplicate
+
+			*((unsigned char*)(buffer + 56)) != TOY_OPCODE_DECLARE ||
+			*((unsigned char*)(buffer + 57)) != TOY_VALUE_STRING ||
+			*((unsigned char*)(buffer + 58)) != 6 ||
+			*((unsigned char*)(buffer + 59)) != 0 ||
+
+			*((unsigned int*)(buffer + 60)) != 8 ||
+
+			*((unsigned char*)(buffer + 64)) != TOY_OPCODE_RETURN ||
+			*((unsigned char*)(buffer + 65)) != 0 ||
+			*((unsigned char*)(buffer + 66)) != 0 ||
+			*((unsigned char*)(buffer + 67)) != 0
+		)
+		{
+			fprintf(stderr, TOY_CC_ERROR "ERROR: failed to produce the expected module code, source: %s\n" TOY_CC_RESET, source);
+
+			//cleanup and return
+			free(buffer);
+			return -1;
+		}
+
+		//cleanup
+		free(buffer);
+	}
+
+	return 0;
+}
+
 int main(void) {
 	//run each test set, returning the total errors given
 	int total = 0, res = 0;
@@ -1343,5 +1427,15 @@ int main(void) {
 		total += res;
 	}
 
+	{
+		Toy_Bucket* bucket = Toy_allocateBucket(TOY_BUCKET_IDEAL);
+		res = test_compiler_string_reuse(&bucket);
+		Toy_freeBucket(&bucket);
+		if (res == 0) {
+			printf(TOY_CC_NOTICE "All good\n" TOY_CC_RESET);
+		}
+		total += res;
+	}
+
 	return total;
 }