Compiler now reuses existing strings in the data, read more

If a string already exists in the data section, the function
'emitCStringToData()' does not write it again; it returns the address
of the existing match within the data section.

The caller can then search the jump table for that address, and either
reuse the existing jump entry or append a new one.

Fixes #168
This commit is contained in:
2025-02-08 17:27:47 +11:00
parent 72f4e4c143
commit 7c054db9e6
3 changed files with 143 additions and 17 deletions

View File

@@ -126,7 +126,7 @@ void Toy_bindModuleBundle(Toy_ModuleBundle* bundle, unsigned char* ptr, unsigned
memcpy(bundle->ptr, ptr, size); memcpy(bundle->ptr, ptr, size);
bundle->count = size; bundle->count = size;
//URGENT: test this //TODO: test this
int valid = validateModuleBundleHeader(bundle); int valid = validateModuleBundleHeader(bundle);
if (valid < 0) { if (valid < 0) {

View File

@@ -108,44 +108,76 @@ static void emitFloat(unsigned char** handle, unsigned int* capacity, unsigned i
//simply get the address (always an integer) //simply get the address (always an integer)
#define CURRENT_ADDRESS(mb, part) ((*mb)->part##Count) #define CURRENT_ADDRESS(mb, part) ((*mb)->part##Count)
//Cached write to data, enabling string reuse, see #168
//
//Searches the entries already stored in the data section (null-terminated strings, each padded
//to a multiple of 4 bytes) for 'cstr'. On a match nothing is written and the offset of the
//existing copy is returned. Otherwise the string is appended, terminator included, padded with
//zero bytes to a multiple of 4, and the offset of the new copy is returned.
//
//dataHandle - handle to the data buffer; forwarded to expand() when a new entry is appended
//capacity   - capacity of the data buffer; forwarded to expand()
//count      - number of bytes in use; advanced by the padded length when a new entry is appended
//cstr       - null-terminated string to store, must not be NULL
//
//returns the byte offset of the string within the data section
static unsigned int emitCStringToData(unsigned char** dataHandle, unsigned int* capacity, unsigned int* count, const char* cstr) {
	const unsigned int slen = (unsigned int)strlen(cstr) + 1; //+1 for null
	const unsigned int padded = (slen + 3) & ~3u; //4-byte aligned

	//See if the string already exists in the data NOTE: assumes data only ever holds c-strings
	unsigned int pos = 0;
	while (pos < *count) {
		const unsigned char* entry = (*dataHandle) + pos;

		//only look for the terminator inside the used region, so a malformed tail can't cause a read beyond it
		const unsigned char* end = memchr(entry, '\0', *count - pos);
		if (end == NULL) {
			break; //not a complete c-string, stop searching and append instead
		}
		const unsigned int elen = (unsigned int)(end - entry) + 1; //+1 for null

		//compare; the lengths are checked first, so a plain byte comparison is sufficient
		if (slen == elen && memcmp(cstr, entry, slen) == 0) {
			return pos;
		}

		//next entry starts at the following 4-byte boundary
		pos += (elen + 3) & ~3u;
	}

	//default, append the new entry
	const unsigned int addr = *count; //save the target address

	//NOTE(review): expand() is defined elsewhere; presumably it makes room for 'padded' more bytes without changing *count - confirm
	expand(dataHandle, capacity, count, padded);
	memcpy((*dataHandle) + addr, cstr, slen);
	memset((*dataHandle) + addr + slen, 0, padded - slen); //zero the alignment padding, so the emitted bytes are deterministic
	*count += padded;

	return addr; //return the address of the string in the data section
}
//Stores a string in the data section (reusing an existing copy when one is present), then writes
//into the code the offset of the jump-table entry that holds the string's data address. A jump
//entry that already points at that address is reused; otherwise a new entry is appended.
//
//mb  - handle to the module compiler whose code, jumps and data sections are written to
//str - the string to emit; node, leaf and name strings are handled
//
//returns 1 on every path
static unsigned int emitString(Toy_ModuleCompiler** mb, Toy_String* str) {
	//the address within the data section
	//NOTE(review): stays 0 if the string type is none of the three handled below, which would alias the first entry - confirm that can't happen
	unsigned int dataAddr = 0;

	//move the string into the data section
	if (str->info.type == TOY_STRING_NODE) {
		char* buffer = Toy_getStringRawBuffer(str);
		dataAddr = emitCStringToData(&(*mb)->data, &(*mb)->dataCapacity, &(*mb)->dataCount, buffer);
		free(buffer);
	}
	else if (str->info.type == TOY_STRING_LEAF) {
		dataAddr = emitCStringToData(&(*mb)->data, &(*mb)->dataCapacity, &(*mb)->dataCount, str->leaf.data);
	}
	else if (str->info.type == TOY_STRING_NAME) {
		dataAddr = emitCStringToData(&(*mb)->data, &(*mb)->dataCapacity, &(*mb)->dataCount, str->name.data);
	}

	//mark the position within the jump index, reusing an existing entry if it exists
	//jumpsCount is used as a byte offset (it is what gets written to the code below), and each entry is
	//written with EMIT_INT, so step one whole entry at a time and compare the full value, not a single byte
	//NOTE(review): assumes EMIT_INT writes sizeof(unsigned int) bytes in native byte order - confirm
	const unsigned int entrySize = (unsigned int)sizeof(unsigned int);
	for (unsigned int i = 0; i + entrySize <= (*mb)->jumpsCount; i += entrySize) {
		unsigned int entry = 0;
		memcpy(&entry, (const unsigned char*)(*mb)->jumps + i, sizeof(entry)); //memcpy avoids alignment and aliasing concerns

		if (entry == dataAddr) {
			//reuse, and finish
			EMIT_INT(mb, code, i);
			return 1;
		}
	}

	EMIT_INT(mb, code, (*mb)->jumpsCount); //mark the new jump index in the code
	EMIT_INT(mb, jumps, dataAddr); //append to the jump table

	return 1;
}
// static unsigned int emitParameter(Toy_ModuleCompiler** mb, Toy_String* str) {
//
// }
static unsigned int writeModuleCompilerCode(Toy_ModuleCompiler** mb, Toy_Ast* ast); //forward declare for recursion static unsigned int writeModuleCompilerCode(Toy_ModuleCompiler** mb, Toy_Ast* ast); //forward declare for recursion
static unsigned int writeInstructionAssign(Toy_ModuleCompiler** mb, Toy_AstVarAssign ast, bool chainedAssignment); //forward declare for chaining of var declarations static unsigned int writeInstructionAssign(Toy_ModuleCompiler** mb, Toy_AstVarAssign ast, bool chainedAssignment); //forward declare for chaining of var declarations

View File

@@ -1309,6 +1309,90 @@ int test_compiler_keywords(Toy_Bucket** bucketHandle) {
return 0; return 0;
} }
//Compiles a script that assigns the same string literal to two variables, then checks the module
//header and the emitted code against the expected values, where both READ instructions carry the same operand.
//Returns 0 on success, or -1 after printing an error to stderr.
int test_compiler_string_reuse(Toy_Bucket** bucketHandle) {
	//test string literals
	{
		//setup
		const char* source = "var first: string = \"Hello world\"; var second: string = \"Hello world\";";

		Toy_Lexer lexer;
		Toy_bindLexer(&lexer, source);

		Toy_Parser parser;
		Toy_bindParser(&parser, &lexer);

		Toy_Ast* ast = Toy_scanParser(bucketHandle, &parser);

		//run
		unsigned char* buffer = Toy_compileModule(ast);

		//check header, one int per field, stopping at the first mismatch
		const int expectedHeader[] = {
			108, //total size
			12,  //jump count
			0,   //param count
			28,  //data count
			0,   //subs count
			32,  //code addr
			68,  //jumps addr
			80,  //data addr
		};
		const unsigned int headerFields = sizeof(expectedHeader) / sizeof(expectedHeader[0]);

		const int* header = (const int*)buffer;
		bool headerMismatch = false;
		for (unsigned int field = 0; field < headerFields; field++) {
			if (header[field] != expectedHeader[field]) {
				headerMismatch = true;
				break;
			}
		}

		if (headerMismatch) {
			fprintf(stderr, TOY_CC_ERROR "ERROR: failed to reuse strings in module header, source: %s\n" TOY_CC_RESET, source);

			//cleanup and return
			free(buffer);
			return -1;
		}

		//check code: each row is a 4-byte instruction word, optionally followed by a 4-byte operand
		const struct {
			unsigned int offset; //where the instruction word starts in the buffer
			int word[4];         //the four expected bytes of the instruction word
			bool hasOperand;     //whether an unsigned int operand follows the word
			unsigned int operand;
		} expectedCode[] = {
			{32, {TOY_OPCODE_READ, TOY_VALUE_STRING, TOY_STRING_LEAF, 0}, true, 0},
			{40, {TOY_OPCODE_DECLARE, TOY_VALUE_STRING, 5, 0}, true, 4},
			{48, {TOY_OPCODE_READ, TOY_VALUE_STRING, TOY_STRING_LEAF, 0}, true, 0}, //duplicate
			{56, {TOY_OPCODE_DECLARE, TOY_VALUE_STRING, 6, 0}, true, 8},
			{64, {TOY_OPCODE_RETURN, 0, 0, 0}, false, 0},
		};
		const unsigned int instructionCount = sizeof(expectedCode) / sizeof(expectedCode[0]);

		bool codeMismatch = false;
		for (unsigned int inst = 0; inst < instructionCount && !codeMismatch; inst++) {
			const unsigned int base = expectedCode[inst].offset;

			//the instruction word, byte by byte
			for (unsigned int b = 0; b < 4; b++) {
				if (*((unsigned char*)(buffer + base + b)) != expectedCode[inst].word[b]) {
					codeMismatch = true;
					break;
				}
			}

			//the operand, if this instruction has one
			if (!codeMismatch && expectedCode[inst].hasOperand) {
				if (*((unsigned int*)(buffer + base + 4)) != expectedCode[inst].operand) {
					codeMismatch = true;
				}
			}
		}

		if (codeMismatch) {
			fprintf(stderr, TOY_CC_ERROR "ERROR: failed to produce the expected module code, source: %s\n" TOY_CC_RESET, source);

			//cleanup and return
			free(buffer);
			return -1;
		}

		//cleanup
		free(buffer);
	}

	return 0;
}
int main(void) { int main(void) {
//run each test set, returning the total errors given //run each test set, returning the total errors given
int total = 0, res = 0; int total = 0, res = 0;
@@ -1343,5 +1427,15 @@ int main(void) {
total += res; total += res;
} }
{
Toy_Bucket* bucket = Toy_allocateBucket(TOY_BUCKET_IDEAL);
res = test_compiler_string_reuse(&bucket);
Toy_freeBucket(&bucket);
if (res == 0) {
printf(TOY_CC_NOTICE "All good\n" TOY_CC_RESET);
}
total += res;
}
return total; return total;
} }