shithub: rgbds

Download patch

ref: 3a44cc7722315a305561cb89b63ecba674a6cdaa
parent: 6af57ff026c297b3572bd3b439fefc65e917d77d
parent: 4cfed3c98f014445d07ccdf7b91dc1c1f2ba0eaf
author: Eldred Habert <[email protected]>
date: Sun Oct 4 00:37:06 EDT 2020

Merge pull request #582 from ISSOtm/rewrite-charmap

Rewrite charmap system

--- a/include/asm/charmap.h
+++ b/include/asm/charmap.h
@@ -11,38 +11,12 @@
 
 #include <stdint.h>
 
-#include "asm/symbol.h"
-
-#define MAXCHARMAPS	512
-#define CHARMAPLENGTH	16
-#define MAXCHARNODES	(MAXCHARMAPS * CHARMAPLENGTH + 1)
-
-/*
- * A node for trie structure.
- */
-struct Charnode {
-	uint8_t code; /* the value in a key-value pair. */
-	uint8_t isCode; /* has 1 if it's a code node, not just a bridge node. */
-	struct Charnode *next[256]; /* each index representing the next possible
-				     * character from its current state.
-				     */
-};
-
-struct Charmap {
-	char name[MAXSYMLEN + 1];
-	int32_t charCount; /* user-side count. */
-	int32_t nodeCount; /* node-side count. */
-	struct Charnode nodes[MAXCHARNODES]; /* first node is reserved for the
-					      * root node in charmap.
-					      */
-};
-
-void charmap_InitMain(void);
 struct Charmap *charmap_New(const char *name, const char *baseName);
+void charmap_Delete(struct Charmap *charmap);
 void charmap_Set(const char *name);
 void charmap_Push(void);
 void charmap_Pop(void);
-int32_t charmap_Add(char *input, uint8_t output);
-int32_t charmap_Convert(char **input);
+void charmap_Add(char *mapping, uint8_t value);
+size_t charmap_Convert(char const *input, uint8_t *output);
 
 #endif /* RGBDS_ASM_CHARMAP_H */
--- a/include/asm/util.h
+++ b/include/asm/util.h
@@ -12,6 +12,6 @@
 #include <stdint.h>
 
 uint32_t calchash(const char *s);
-int32_t readUTF8Char(char *dest, char *src);
+size_t readUTF8Char(uint8_t *dest, char const *src);
 
 #endif /* RGBDS_UTIL_H */
--- a/include/asm/warning.h
+++ b/include/asm/warning.h
@@ -14,19 +14,19 @@
 extern unsigned int nbErrors;
 
 enum WarningID {
-	WARNING_ASSERT,		/* Assertions */
-	WARNING_BUILTIN_ARG,	/* Invalid args to builtins */
-	WARNING_DIV,		/* Division undefined behavior */
-	WARNING_EMPTY_DATA_DIRECTIVE,
-				/* `db`, `dw` or `dl` with no directive in ROM */
-	WARNING_EMPTY_ENTRY,	/* Empty entry in `db`, `dw` or `dl` */
-	WARNING_LARGE_CONSTANT,	/* Constants too large */
-	WARNING_LONG_STR,	/* String too long for internal buffers */
-	WARNING_OBSOLETE,	/* Obsolete things */
-	WARNING_SHIFT,		/* Shifting undefined behavior */
-	WARNING_SHIFT_AMOUNT,	/* Strange shift amount */
-	WARNING_TRUNCATION,	/* Implicit truncation loses some bits */
-	WARNING_USER,		/* User warnings */
+	WARNING_ASSERT,		      /* Assertions */
+	WARNING_BUILTIN_ARG,	      /* Invalid args to builtins */
+	WARNING_CHARMAP_REDEF,        /* Charmap entry re-definition */
+	WARNING_DIV,		      /* Division undefined behavior */
+	WARNING_EMPTY_DATA_DIRECTIVE, /* `db`, `dw` or `dl` with no directive in ROM */
+	WARNING_EMPTY_ENTRY,	      /* Empty entry in `db`, `dw` or `dl` */
+	WARNING_LARGE_CONSTANT,	      /* Constants too large */
+	WARNING_LONG_STR,	      /* String too long for internal buffers */
+	WARNING_OBSOLETE,	      /* Obsolete things */
+	WARNING_SHIFT,		      /* Shifting undefined behavior */
+	WARNING_SHIFT_AMOUNT,	      /* Strange shift amount */
+	WARNING_TRUNCATION,	      /* Implicit truncation loses some bits */
+	WARNING_USER,		      /* User warnings */
 
 	NB_WARNINGS,
 
--- a/src/asm/asmy.y
+++ b/src/asm/asmy.y
@@ -96,7 +96,7 @@
 	return length;
 }
 
-static uint32_t str2int2(char *s, int32_t length)
+static uint32_t str2int2(uint8_t *s, int32_t length)
 {
 	int32_t i;
 	uint32_t r = 0;
@@ -104,7 +104,7 @@
 	i = length < 4 ? 0 : length - 4;
 	while (i < length) {
 		r <<= 8;
-		r |= (uint8_t)s[i];
+		r |= s[i];
 		i++;
 	}
 
@@ -1023,9 +1023,7 @@
 charmap		: T_POP_CHARMAP string ',' const {
 			if ($4 < INT8_MIN || $4 > UINT8_MAX)
 				warning(WARNING_TRUNCATION, "Expression must be 8-bit\n");
-
-			if (charmap_Add($2, (uint8_t)$4) == -1)
-				error("Error adding new charmap mapping: %s\n", strerror(errno));
+			charmap_Add($2, (uint8_t)$4);
 		}
 ;
 
@@ -1132,11 +1130,11 @@
 		}
 		| reloc_8bit_no_str	{ out_RelByte(&$1); }
 		| string {
-			char *s = $1;
-			int32_t length = charmap_Convert(&s);
+			uint8_t *output = malloc(strlen($1)); /* Cannot be larger than that */
+			int32_t length = charmap_Convert($1, output);
 
-			out_AbsByteGroup((uint8_t*)s, length);
-			free(s);
+			out_AbsByteGroup(output, length);
+			free(output);
 		}
 ;
 
@@ -1189,11 +1187,11 @@
 
 relocexpr	: relocexpr_no_str
 		| string {
-			char *s = $1;
-			int32_t length = charmap_Convert(&s);
-			uint32_t r = str2int2(s, length);
+			uint8_t *output = malloc(strlen($1)); /* Cannot be longer than that */
+			int32_t length = charmap_Convert($1, output);
+			uint32_t r = str2int2(output, length);
 
-			free(s);
+			free(output);
 			rpn_Number(&$$, r);
 		}
 ;
--- a/src/asm/charmap.c
+++ b/src/asm/charmap.c
@@ -22,18 +22,36 @@
 
 #include "hashmap.h"
 
-#define CHARMAP_HASH_SIZE (1 << 9)
+/*
+ * Charmaps are stored using a structure known as "trie".
+ * Essentially a tree, where each nodes stores a single character's worth of info:
+ * whether there exists a mapping that ends at the current character,
+ */
+struct Charnode {
+	bool isTerminal; /* Whether there exists a mapping that ends here */
+	uint8_t value; /* If the above is true, its corresponding value */
+	/* This MUST be indexes and not pointers, because pointers get invalidated by `realloc`!! */
+	size_t next[255]; /* Indexes of where to go next, 0 = nowhere */
+};
 
-struct CharmapStackEntry {
-	struct Charmap *charmap;
-	struct CharmapStackEntry *next;
+#define INITIAL_CAPACITY 32
+
+struct Charmap {
+	char *name;
+	size_t usedNodes; /* How many nodes are being used */
+	size_t capacity; /* How many nodes have been allocated */
+	struct Charnode nodes[]; /* first node is reserved for the root node */
 };
 
 static HashMap charmaps;
 
-static struct Charmap *mainCharmap;
 static struct Charmap *currentCharmap;
 
+struct CharmapStackEntry {
+	struct Charmap *charmap;
+	struct CharmapStackEntry *next;
+};
+
 struct CharmapStackEntry *charmapStack;
 
 static inline struct Charmap *charmap_Get(const char *name)
@@ -41,18 +59,23 @@
 	return hash_GetElement(charmaps, name);
 }
 
-static void CopyNode(struct Charmap *dest,
-		     const struct Charmap *src,
-		     int nodeIdx)
+static inline struct Charmap *resizeCharmap(struct Charmap *map, size_t capacity)
 {
-	dest->nodes[nodeIdx].code = src->nodes[nodeIdx].code;
-	dest->nodes[nodeIdx].isCode = src->nodes[nodeIdx].isCode;
-	for (int i = 0; i < 256; i++)
-		if (src->nodes[nodeIdx].next[i])
-			dest->nodes[nodeIdx].next[i] = dest->nodes +
-				(src->nodes[nodeIdx].next[i] - src->nodes);
+	struct Charmap *new = realloc(map, sizeof(*map) + sizeof(*map->nodes) * capacity);
+
+	if (!new)
+		fatalerror("Failed to %s charmap: %s\n",
+			   map ? "create" : "resize", strerror(errno));
+	new->capacity = capacity;
+	return new;
 }
 
+static inline void initNode(struct Charnode *node)
+{
+	node->isTerminal = false;
+	memset(node->next, 0, sizeof(node->next));
+}
+
 struct Charmap *charmap_New(const char *name, const char *baseName)
 {
 	struct Charmap *base = NULL;
@@ -66,28 +89,23 @@
 
 	struct Charmap *charmap = charmap_Get(name);
 
-	if (charmap != NULL) {
+	if (charmap) {
 		error("Charmap '%s' already exists\n", name);
-		return NULL;
+		return charmap;
 	}
 
-	charmap = malloc(sizeof(*charmap));
-	if (charmap == NULL)
-		fatalerror("Failed to create charmap: %s\n", strerror(errno));
-
 	/* Init the new charmap's fields */
-	snprintf(charmap->name, sizeof(charmap->name), "%s", name);
-	if (base != NULL) {
-		charmap->charCount = base->charCount;
-		charmap->nodeCount = base->nodeCount;
+	if (base) {
+		charmap = resizeCharmap(NULL, base->capacity);
+		charmap->usedNodes = base->usedNodes;
 
-		for (int i = 0; i < MAXCHARNODES; i++)
-			CopyNode(charmap, base, i);
+		memcpy(charmap->nodes, base->nodes, sizeof(base->nodes[0]) * charmap->usedNodes);
 	} else {
-		charmap->charCount = 0;
-		charmap->nodeCount = 0;
-		memset(charmap->nodes, 0, sizeof(charmap->nodes));
+		charmap = resizeCharmap(NULL, INITIAL_CAPACITY);
+		charmap->usedNodes = 1;
+		initNode(&charmap->nodes[0]); /* Init the root node */
 	}
+	charmap->name = strdup(name);
 
 	hash_AddElement(charmaps, charmap->name, charmap);
 	currentCharmap = charmap;
@@ -95,6 +113,12 @@
 	return charmap;
 }
 
+void charmap_Delete(struct Charmap *charmap)
+{
+	free(charmap->name);
+	free(charmap);
+}
+
 void charmap_Set(const char *name)
 {
 	struct Charmap *charmap = charmap_Get(name);
@@ -109,9 +133,9 @@
 {
 	struct CharmapStackEntry *stackEntry;
 
-	stackEntry = malloc(sizeof(struct CharmapStackEntry));
+	stackEntry = malloc(sizeof(*stackEntry));
 	if (stackEntry == NULL)
-		fatalerror("No memory for charmap stack\n");
+		fatalerror("Failed to alloc charmap stack entry: %s\n", strerror(errno));
 
 	stackEntry->charmap = currentCharmap;
 	stackEntry->next = charmapStack;
@@ -121,8 +145,10 @@
 
 void charmap_Pop(void)
 {
-	if (charmapStack == NULL)
-		fatalerror("No entries in the charmap stack\n");
+	if (charmapStack == NULL) {
+		error("No entries in the charmap stack\n");
+		return;
+	}
 
 	struct CharmapStackEntry *top = charmapStack;
 
@@ -131,109 +157,86 @@
 	free(top);
 }
 
-void charmap_InitMain(void)
+void charmap_Add(char *mapping, uint8_t value)
 {
-	mainCharmap = charmap_New("main", NULL);
-}
+	struct Charnode *node = &currentCharmap->nodes[0];
 
-int32_t charmap_Add(char *input, uint8_t output)
-{
-	int32_t i;
-	uint8_t v;
+	for (uint8_t c; *mapping; mapping++) {
+		c = *mapping - 1;
 
-	struct Charmap  *charmap = currentCharmap;
-	struct Charnode *curr_node, *temp_node;
-
-	curr_node = &charmap->nodes[0];
-
-	for (i = 0; (v = (uint8_t)input[i]); i++) {
-		if (curr_node->next[v]) {
-			curr_node = curr_node->next[v];
+		if (node->next[c]) {
+			node = &currentCharmap->nodes[node->next[c]];
 		} else {
-			temp_node = &charmap->nodes[charmap->nodeCount + 1];
+			/* Register next available node */
+			node->next[c] = currentCharmap->usedNodes;
+			/* If no more nodes are available, get new ones */
+			if (currentCharmap->usedNodes == currentCharmap->capacity) {
+				currentCharmap->capacity *= 2;
+				currentCharmap = resizeCharmap(currentCharmap, currentCharmap->capacity);
+			}
 
-			curr_node->next[v] = temp_node;
-			curr_node = temp_node;
-
-			++charmap->nodeCount;
+			/* Switch to and init new node */
+			node = &currentCharmap->nodes[currentCharmap->usedNodes++];
+			initNode(node);
 		}
 	}
 
-	/* prevent duplicated keys by accepting only first key-value pair.  */
-	if (curr_node->isCode)
-		return charmap->charCount;
+	if (node->isTerminal)
+		warning(WARNING_CHARMAP_REDEF, "Overriding charmap mapping");
 
-	curr_node->code = output;
-	curr_node->isCode = 1;
-
-	return ++charmap->charCount;
+	node->isTerminal = true;
+	node->value = value;
 }
 
-int32_t charmap_Convert(char **input)
+size_t charmap_Convert(char const *input, uint8_t *output)
 {
-	struct Charmap  *charmap = currentCharmap;
-	struct Charnode *charnode;
+	/*
+	 * The goal is to match the longest mapping possible.
+	 * For that, advance through the trie with each character read.
+	 * If that would lead to a dead end, rewind characters until the last match, and output.
+	 * If no match, read a UTF-8 codepoint and output that.
+	 */
+	size_t outputLen = 0;
+	struct Charnode const *node = &currentCharmap->nodes[0];
+	struct Charnode const *match = NULL;
+	size_t rewindDistance = 0;
 
-	char *output;
-	char outchar[8];
+	for (;;) {
+		/* We still want NULs to reach the `else` path, to give a chance to rewind */
+		uint8_t c = *input - 1;
 
-	int32_t i, match, length;
-	uint8_t v, foundCode;
+		if (*input && node->next[c]) {
+			input++; /* Consume that char */
+			rewindDistance++;
 
-	output = malloc(strlen(*input));
-	if (output == NULL)
-		fatalerror("Not enough memory for charmap conversion buffer: %s\n",
-			   strerror(errno));
+			node = &currentCharmap->nodes[node->next[c]];
+			if (node->isTerminal) {
+				match = node;
+				rewindDistance = 0; /* Rewind from after the match */
+			}
 
-	length = 0;
+		} else {
+			input -= rewindDistance; /* Rewind */
+			rewindDistance = 0;
+			node = &currentCharmap->nodes[0];
 
-	while (**input) {
-		charnode = &charmap->nodes[0];
+			if (match) { /* Arrived at a dead end with a match found */
+				*output++ = match->value;
+				outputLen++;
+				match = NULL; /* Reset match for next round */
 
-		/*
-		 * Find the longest valid match which has been registered in
-		 * charmap, possibly yielding multiple or no matches.
-		 * The longest match is taken, meaning partial matches shorter
-		 * than the longest one are ignored.
-		 */
-		for (i = match = 0; (v = (*input)[i]);) {
-			if (!charnode->next[v])
-				break;
+			} else if (*input) { /* No match found */
+				size_t codepointLen = readUTF8Char(output, input);
 
-			charnode = charnode->next[v];
-			i++;
-
-			if (charnode->isCode) {
-				match = i;
-				foundCode = charnode->code;
+				input += codepointLen; /* OK because UTF-8 has no NUL in multi-byte chars */
+				output += codepointLen;
+				outputLen += codepointLen;
 			}
-		}
 
-		if (match) {
-			output[length] = foundCode;
-
-			length++;
-		} else {
-			/*
-			 * put a utf-8 character
-			 * if failed to find a match.
-			 */
-			match = readUTF8Char(outchar, *input);
-
-			if (match) {
-				memcpy(output + length, *input, match);
-			} else {
-				output[length] = 0;
-				match = 1;
-			}
-
-			length += match;
+			if (!*input)
+				break;
 		}
-
-		*input += match;
 	}
 
-	*input = output;
-
-	return length;
+	return outputLen;
 }
--- a/src/asm/main.c
+++ b/src/asm/main.c
@@ -538,7 +538,7 @@
 	sym_SetExportAll(exportall);
 	fstk_Init(tzMainfile);
 	opt_ParseDefines();
-	charmap_InitMain();
+	charmap_New("main", NULL);
 
 	yy_set_state(LEX_STATE_NORMAL);
 	opt_SetCurrentOptions(&DefaultOptions);
--- a/src/asm/rgbasm.1
+++ b/src/asm/rgbasm.1
@@ -203,6 +203,10 @@
 with indexes outside of the string's bounds.
 This warning is enabled by
 .Fl Wall .
+.It Fl Wcharmap-redef
+Warn when re-defining a charmap mapping.
+This warning is enabled by
+.Fl Wall .
 .It Fl Wdiv
 Warn when dividing the smallest negative integer by -1, which yields itself due to integer overflow.
 .It Fl Wempty-entry
--- a/src/asm/util.c
+++ b/src/asm/util.c
@@ -27,21 +27,20 @@
 	return hash;
 }
 
-int32_t readUTF8Char(char *dest, char *src)
+size_t readUTF8Char(uint8_t *dest, char const *src)
 {
-	uint32_t state;
+	uint32_t state = 0;
 	uint32_t codep;
-	int32_t i;
+	size_t i = 0;
 
-	for (i = 0, state = 0;; i++) {
+	for (;;) {
 		if (decode(&state, &codep, (uint8_t)src[i]) == 1)
 			fatalerror("invalid UTF-8 character\n");
 
 		dest[i] = src[i];
+		i++;
 
-		if (state == 0) {
-			dest[++i] = '\0';
+		if (state == 0)
 			return i;
-		}
 	}
 }
--- a/src/asm/warning.c
+++ b/src/asm/warning.c
@@ -30,6 +30,7 @@
 static enum WarningState const defaultWarnings[NB_WARNINGS] = {
 	[WARNING_ASSERT]		= WARNING_ENABLED,
 	[WARNING_BUILTIN_ARG]		= WARNING_DISABLED,
+	[WARNING_CHARMAP_REDEF]		= WARNING_DISABLED,
 	[WARNING_DIV]			= WARNING_DISABLED,
 	[WARNING_EMPTY_DATA_DIRECTIVE]	= WARNING_DISABLED,
 	[WARNING_EMPTY_ENTRY]		= WARNING_DISABLED,
@@ -68,6 +69,7 @@
 static char const *warningFlags[NB_WARNINGS_ALL] = {
 	"assert",
 	"builtin-args",
+	"charmap-redef",
 	"div",
 	"empty-data-directive",
 	"empty-entry",
@@ -92,6 +94,7 @@
 /* Warnings that probably indicate an error */
 static uint8_t const _wallCommands[] = {
 	WARNING_BUILTIN_ARG,
+	WARNING_CHARMAP_REDEF,
 	WARNING_EMPTY_DATA_DIRECTIVE,
 	WARNING_LARGE_CONSTANT,
 	WARNING_LONG_STR,
--- a/test/asm/multiple-charmaps.err
+++ b/test/asm/multiple-charmaps.err
@@ -4,3 +4,4 @@
     Charmap 'map5' doesn't exist
 ERROR: multiple-charmaps.asm(104) -> multiple-charmaps.asm::pop_(23):
     No entries in the charmap stack
+error: Assembly aborted (3 errors)!