diff options
| author | Federico Angelilli <code@fedang.net> | 2025-02-22 11:28:23 +0100 |
|---|---|---|
| committer | Federico Angelilli <code@fedang.net> | 2025-02-22 11:28:23 +0100 |
| commit | ecde24e587f1f0153552f1e7c74e87fda35f5eff (patch) | |
| tree | 48dde9761c32902c0a0643fb0a161daeda5617a3 | |
| parent | 228252e7f9bb035681ef9c4708db37b8393327d9 (diff) | |
Add implementation
| -rw-r--r-- | content/posts/pointer-tagging/highbits.c | 125 | ||||
| -rw-r--r-- | content/posts/pointer-tagging/index.md | 20 | ||||
| -rw-r--r-- | content/posts/pointer-tagging/lowbits.c | 118 | ||||
| -rw-r--r-- | content/posts/pointer-tagging/test.c | 101 | ||||
| -rw-r--r-- | content/posts/pointer-tagging/union.c | 56 |
5 files changed, 409 insertions, 11 deletions
diff --git a/content/posts/pointer-tagging/highbits.c b/content/posts/pointer-tagging/highbits.c new file mode 100644 index 0000000..791ff51 --- /dev/null +++ b/content/posts/pointer-tagging/highbits.c @@ -0,0 +1,125 @@ +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +// The last byte must be zero +typedef struct { + char data[8]; +} tinystr_t; + +// Only the integer tag sets the highest bit +typedef enum { + TAG_OBJECT = 0, // 0b000000 + TAG_INTEGER = 0x20, // 0b100000 + TAG_FLOAT = 2, // 0b000010 + TAG_STRING = 3, // 0b000011 + TAG_TINYSTR = 4, // 0b000100 +} value_tag_t; + +typedef uintptr_t value_t; + +#define VALUE_BITS 64 +#define VALUE_TAG_BITS 6 +#define VALUE_TAG_SHIFT (VALUE_BITS - 1 - VALUE_TAG_BITS) +#define VALUE_TAG_MASK ((value_t)0x3f << VALUE_TAG_SHIFT) + +#define VALUE_GET_TAG(val, mask) (value_tag_t)(((value_t)(val) & mask) >> VALUE_TAG_SHIFT) +#define VALUE_SET_TAG(val, tag) ((value_t)(val) | (value_t)tag << VALUE_TAG_SHIFT) +#define VALUE_UNSET_TAG(val) ((val) & ~VALUE_TAG_MASK) + +// Object value +#define VALUE_IS_OBJECT(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_OBJECT) +#define VALUE_FROM_OBJECT(obj) VALUE_SET_TAG(obj, TAG_OBJECT) +#define VALUE_TO_OBJECT(val) (void *)VALUE_UNSET_TAG(val) + +// Integer value +#define INTEGER_SHIFT (VALUE_BITS - 1 - 1) +#define INTEGER_MASK ((value_t)1 << INTEGER_SHIFT) +#define INTEGER_HIGH_MASK ((value_t)7 << (INTEGER_SHIFT - 1)) +#define INTEGER_SIGN_BIT ((value_t)1 << (INTEGER_SHIFT - 1)) + +#define INTEGER_MAX (INTEGER_SIGN_BIT - 1) +#define INTEGER_MIN (-INTEGER_SIGN_BIT) + +#define VALUE_IS_INTEGER(val) (VALUE_GET_TAG(val, INTEGER_MASK) == TAG_INTEGER) +#define VALUE_FROM_INTEGER(num) value_tag_integer(num) +#define VALUE_TO_INTEGER(val) value_untag_integer(val) + +value_t value_tag_integer(intptr_t num) { + assert(num < INTEGER_MIN || num > INTEGER_MAX); + // Clear the top bits + value_t val = num & ~INTEGER_HIGH_MASK; + // Move the sign bit + if (num < 0) val |= INTEGER_SIGN_BIT; + return VALUE_SET_TAG(val, TAG_INTEGER); +} + +intptr_t value_untag_integer(value_t val) { + assert(VALUE_IS_INTEGER(val)); + intptr_t num = val & ~INTEGER_HIGH_MASK; + // If the number is negative, pad with 1's to adjust the two's complement + if (val & INTEGER_SIGN_BIT) num |= INTEGER_HIGH_MASK; + return num; +} + +// Float value +#define VALUE_IS_FLOAT(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_FLOAT) +#define VALUE_FROM_FLOAT(num) value_tag_float(num) +#define VALUE_TO_FLOAT(val) value_untag_float(val) + +value_t value_tag_float(float num) { + union { + uint32_t raw; + float num; + } pun; + pun.num = num; + return VALUE_SET_TAG(pun.raw, TAG_FLOAT); +} + +float value_untag_float(value_t val) { + assert(VALUE_IS_FLOAT(val)); + union { + uint32_t raw; + float num; + } pun; + pun.raw = val & 0xffffffff; + return pun.num; +} + +// String value +#define VALUE_IS_STRING(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_STRING) +#define VALUE_FROM_STRING(str) VALUE_SET_TAG(str, TAG_STRING) +#define VALUE_TO_STRING(val) (char *)VALUE_UNSET_TAG(val) + +// Tiny string value + +#define VALUE_IS_TINYSTR(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_TINYSTR) +#define VALUE_FROM_TINYSTR(num) value_tag_tinystr(num) +#define VALUE_TO_TINYSTR(val) value_untag_tinystr(val) + +value_t value_tag_tinystr(tinystr_t str) { + assert(str.data[7] == '\0'); + return ((value_t)str.data[0] << 0) + | ((value_t)str.data[1] << 8) + | ((value_t)str.data[2] << 16) + | ((value_t)str.data[3] << 24) + | ((value_t)str.data[4] << 32) + | ((value_t)str.data[5] << 40) + | ((value_t)str.data[6] << 48) + | (value_t)TAG_TINYSTR << VALUE_TAG_SHIFT; +} + +tinystr_t value_untag_tinystr(value_t val) { + assert(VALUE_IS_TINYSTR(val)); + tinystr_t str = { + (val >> 0) & 0xff, + (val >> 8) & 0xff, + (val >> 16) & 0xff, + (val >> 24) & 0xff, + (val >> 32) & 0xff, + (val >> 40) & 0xff, + (val >> 48) & 0xff, + }; + return str; +} diff --git a/content/posts/pointer-tagging/index.md b/content/posts/pointer-tagging/index.md index 7f133e8..456c959 100644 --- a/content/posts/pointer-tagging/index.md +++ b/content/posts/pointer-tagging/index.md @@ -123,27 +123,25 @@ Top Bottom ## Why is this useful? -Now that we have understood the principles behind pointer tagging, -we need to address :wink: their advantages. +Now that we have a good grasp of the principles of pointer tagging, +we can move on to address :wink: their practical uses and advantages. -The stored metadata could be used for many things, including memory safety[^mte2]. -However, I will focus on applications for language development and interpreters. +The metadata stored in the tags could be utilized in many ways. +For example, Android uses it to improve memory safety through MTE[^mte2]. +From now on, I will focus on its applications in language development and interpreters. -Before delving into the benefits of tagged pointers, let's see what they are replacing. +But before delving into the benefits of this optimization, let's first see what they are replacing. A common approach is to heap allocate everything as an object and use pointers as a uniform value representation. This is what CPython does with `PyObject`. The usual alternative is a *tagged union*, which stores a union of the possible value types alongside -an integer flag. This means less heap allocations with the cost of a bigger value type (usually 16 bytes). +an integer flag. This results in fewer heap allocations, but comes at the cost of a larger value type (usually 16 bytes). Lua is currently using this approach[^lua]. -Is there a way to have our cake and eat it too? -By tagging our pointers, we can decrease heap allocations and have a -single word as our value representation. +By tagging our pointers, we can reduce heap allocations and have a single word as our value representation. +We can have our cake and eat it too! -while retaining pointers as our value representation by tagging them. - diff --git a/content/posts/pointer-tagging/lowbits.c b/content/posts/pointer-tagging/lowbits.c new file mode 100644 index 0000000..7b3c8c8 --- /dev/null +++ b/content/posts/pointer-tagging/lowbits.c @@ -0,0 +1,118 @@ +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +// The last byte must be zero +typedef struct { + char data[8]; +} tinystr_t; + +// Only the integer tag sets the lowest bit +typedef enum { + TAG_OBJECT = 0, // 0b000 + TAG_INTEGER = 1, // 0b001 + TAG_FLOAT = 2, // 0b010 + TAG_STRING = 4, // 0b100 + TAG_TINYSTR = 6, // 0b110 +} value_tag_t; + +typedef uintptr_t value_t; + +#define VALUE_BITS 64 +#define VALUE_TAG_BITS 3 +#define VALUE_TAG_MASK 7 // 0b111 + +#define VALUE_GET_TAG(val, mask) (value_tag_t)((value_t)(val) & mask) +#define VALUE_SET_TAG(val, tag) ((value_t)(val) | tag) +#define VALUE_UNSET_TAG(val) ((val) & ~VALUE_TAG_MASK) + +// Object value +#define VALUE_IS_OBJECT(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_OBJECT) +#define VALUE_FROM_OBJECT(obj) VALUE_SET_TAG(obj, TAG_OBJECT) +#define VALUE_TO_OBJECT(val) (void *)VALUE_UNSET_TAG(val) + +// Integer value +#define INTEGER_SHIFT 1 +#define INTEGER_MASK 1 +#define INTEGER_SIGN_BIT ((value_t)1 << (VALUE_BITS - 1)) + +#define INTEGER_MAX (((value_t)1 << (VALUE_BITS - 1 - INTEGER_SHIFT)) - 1) +#define INTEGER_MIN -((value_t)1 << (VALUE_BITS - 1 - INTEGER_SHIFT)) + +#define VALUE_IS_INTEGER(val) (VALUE_GET_TAG(val, INTEGER_MASK) == TAG_INTEGER) +#define VALUE_FROM_INTEGER(num) value_tag_integer(num) +#define VALUE_TO_INTEGER(val) value_untag_integer(val) + +value_t value_tag_integer(intptr_t num) { + assert(num < INTEGER_MIN || num > INTEGER_MAX); + return (value_t)num << INTEGER_SHIFT | (num & INTEGER_SIGN_BIT) | TAG_INTEGER; +} + +intptr_t value_untag_integer(value_t val) { + assert(VALUE_IS_INTEGER(val)); + return (intptr_t)val >> INTEGER_SHIFT | (val & INTEGER_SIGN_BIT); +} + +// Float value +#define FLOAT_SHIFT 31 + +#define VALUE_IS_FLOAT(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_FLOAT) +#define VALUE_FROM_FLOAT(num) value_tag_float(num) +#define VALUE_TO_FLOAT(val) value_untag_float(val) + +value_t value_tag_float(float num) { + union { + uint32_t raw; + float num; + } pun; + pun.num = num; + return VALUE_SET_TAG((value_t)pun.raw << FLOAT_SHIFT, TAG_FLOAT); +} + +float value_untag_float(value_t val) { + assert(VALUE_IS_FLOAT(val)); + union { + uint32_t raw; + float num; + } pun; + pun.raw = (val >> FLOAT_SHIFT) & 0xffffffff; + return pun.num; +} + +// String value +#define VALUE_IS_STRING(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_STRING) +#define VALUE_FROM_STRING(str) VALUE_SET_TAG(str, TAG_STRING) +#define VALUE_TO_STRING(val) (char *)VALUE_UNSET_TAG(val) + +// Tiny string value + +#define VALUE_IS_TINYSTR(val) (VALUE_GET_TAG(val, VALUE_TAG_MASK) == TAG_TINYSTR) +#define VALUE_FROM_TINYSTR(num) value_tag_tinystr(num) +#define VALUE_TO_TINYSTR(val) value_untag_tinystr(val) + +value_t value_tag_tinystr(tinystr_t str) { + assert(str.data[7] == '\0'); + return ((value_t)str.data[0] << 8) + | ((value_t)str.data[1] << 16) + | ((value_t)str.data[2] << 24) + | ((value_t)str.data[3] << 32) + | ((value_t)str.data[4] << 40) + | ((value_t)str.data[5] << 48) + | ((value_t)str.data[6] << 56) + | TAG_TINYSTR; +} + +tinystr_t value_untag_tinystr(value_t val) { + assert(VALUE_IS_TINYSTR(val)); + tinystr_t str = { + (val >> 8) & 0xff, + (val >> 16) & 0xff, + (val >> 24) & 0xff, + (val >> 32) & 0xff, + (val >> 40) & 0xff, + (val >> 48) & 0xff, + (val >> 56) & 0xff, + }; + return str; +} diff --git a/content/posts/pointer-tagging/test.c b/content/posts/pointer-tagging/test.c new file mode 100644 index 0000000..88d259b --- /dev/null +++ b/content/posts/pointer-tagging/test.c @@ -0,0 +1,101 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "lowbits.c" +//#include "highbits.c" +//#include "union.c" + +void print_binary(value_t v) { + const size_t n = sizeof(value_t) / sizeof(uintptr_t); + uintptr_t ls[n]; + memcpy(ls, &v, sizeof(value_t)); + + for (int i = 0; i < n; i++) { + for (int j = 8 * sizeof(uintptr_t) - 1; j >= 0; j--) { + putchar((ls[i] >> j) & 1 ? '1' : '0'); + } + puts(""); + } +} + +int main() { + + // Object + { + void *obj = malloc(123); + value_t v = VALUE_FROM_OBJECT(obj); + printf("Object %p\nTagged: ", obj); + print_binary(v); + puts(""); + + assert(VALUE_IS_OBJECT(v)); + assert(VALUE_TO_OBJECT(v) == obj); + } + + // Integer + { + intptr_t ls[] = { + -1, 10, 1000000, INTEGER_MAX, INTEGER_MIN, 424242 + }; + + for (int i = 0; i < sizeof(ls)/sizeof(*ls); i++) { + value_t v = VALUE_FROM_INTEGER(ls[i]); + printf("Integer %ld\nTagged: ", ls[i]); + print_binary(v); + puts(""); + + assert(ls[i] == VALUE_TO_INTEGER(v)); + } + } + + // Float + { + float ls[] = { + -1.0, 88.88, 10e-16, 199.9876e12, -1111111111 + }; + + for (int i = 0; i < sizeof(ls)/sizeof(*ls); i++) { + value_t v = VALUE_FROM_FLOAT(ls[i]); + printf("Float %f\nTagged: ", ls[i]); + print_binary(v); + puts(""); + + assert(ls[i] == VALUE_TO_FLOAT(v)); + } + } + + // String + { + char *str = malloc(20); + strcpy(str, "Hello, World"); + + value_t v = VALUE_FROM_STRING(str); + printf("String %s\nTagged: ", str); + print_binary(v); + puts(""); + + assert(VALUE_IS_STRING(v)); + assert(VALUE_TO_STRING(v) == str); + } + + // Tiny string + { + tinystr_t s1 = { "Short" }; + value_t v1 = VALUE_FROM_TINYSTR(s1); + printf("Tiny string %s\nTagged: ", s1.data); + print_binary(v1); + puts(""); + + assert(!memcmp(s1.data, VALUE_TO_TINYSTR(v1).data, 8)); + + tinystr_t s2 = { "1234567" }; + value_t v2 = VALUE_FROM_TINYSTR(s2); + printf("Tiny string %s\nTagged: ", s2.data); + print_binary(v2); + puts(""); + + assert(!memcmp(s2.data, VALUE_TO_TINYSTR(v2).data, 8)); + } + + return 0; +} diff --git a/content/posts/pointer-tagging/union.c b/content/posts/pointer-tagging/union.c new file mode 100644 index 0000000..6cba12f --- /dev/null +++ b/content/posts/pointer-tagging/union.c @@ -0,0 +1,56 @@ +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +// The last byte must be zero +typedef struct { + char data[8]; +} tinystr_t; + +typedef enum { + TAG_OBJECT, + TAG_INTEGER, + TAG_FLOAT, + TAG_STRING, + TAG_TINYSTR, +} value_tag_t; + +typedef struct { + value_tag_t tag; + union { + void *object; + intptr_t integer; + float float_; + char *string; + tinystr_t tinystr; + } to; +} value_t; + +// Object value +#define VALUE_IS_OBJECT(val) ((val).tag == TAG_OBJECT) +#define VALUE_FROM_OBJECT(obj) ((value_t){ .tag = TAG_OBJECT, .to.object = (void *)(obj) }) +#define VALUE_TO_OBJECT(val) ((val).to.object) + +// Integer value +#define INTEGER_MAX INTPTR_MAX +#define INTEGER_MIN INTPTR_MIN + +#define VALUE_IS_INTEGER(val) ((val).tag == TAG_INTEGER) +#define VALUE_FROM_INTEGER(num) ((value_t){ .tag = TAG_INTEGER, .to.integer = (num) }) +#define VALUE_TO_INTEGER(val) ((val).to.integer) + +// Float value +#define VALUE_IS_FLOAT(val) ((val).tag == TAG_FLOAT) +#define VALUE_FROM_FLOAT(num) ((value_t){ .tag = TAG_FLOAT, .to.float_ = (num) }) +#define VALUE_TO_FLOAT(val) ((val).to.float_) + +// String value +#define VALUE_IS_STRING(val) ((val).tag == TAG_STRING) +#define VALUE_FROM_STRING(str) ((value_t){ .tag = TAG_STRING, .to.string = (char *)(str) }) +#define VALUE_TO_STRING(val) ((val).to.string) + +// Tiny string value +#define VALUE_IS_TINYSTR(val) ((val).tag == TAG_TINYSTR) +#define VALUE_FROM_TINYSTR(str) ((value_t){ .tag = TAG_TINYSTR, .to.tinystr = (str) }) +#define VALUE_TO_TINYSTR(val) ((val).to.tinystr) |
