Skip to content

Commit fa21da6

Browse files
committed
Unpacker: key_cache option.
Heavily inspired by ruby/json#675.

When parsing documents with the same structure repeatedly, a lot of time can be saved by keeping a small cache of the map keys encountered.

Using the existing `bench/bench.rb`, comparing with and without the cache shows a 30% performance improvement:

```
Calculating -------------------------------------
       unpack-pooled    960.380k (± 1.4%) i/s -      4.865M in   5.066600s
    unpack-key-cache      1.245M (± 1.6%) i/s -      6.232M in   5.009060s

Comparison:
       unpack-pooled:   960379.8 i/s
    unpack-key-cache:  1244517.6 i/s - 1.30x  (± 0.00) faster
```

However, on the same benchmark, but with the cache filled with other keys, the performance is notably degraded:

```
Calculating -------------------------------------
       unpack-pooled    926.849k (± 2.1%) i/s -      4.639M in   5.007333s
    unpack-key-cache    822.266k (± 2.4%) i/s -      4.113M in   5.004645s

Comparison:
       unpack-pooled:   926849.2 i/s
    unpack-key-cache:   822265.6 i/s - 1.13x  (± 0.00) slower
```

So this feature is powerful but situational.
1 parent 83a2600 commit fa21da6

File tree

5 files changed

+211
-8
lines changed

5 files changed

+211
-8
lines changed

doclib/msgpack/unpacker.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ class Unpacker
1919
# Supported options:
2020
#
2121
# * *:symbolize_keys* deserialize keys of Hash objects as Symbol instead of String
22+
# * *:freeze* freeze the deserialized objects. Can allow string deduplication and some allocation elision.
23+
# * *:key_cache* enable caching of map keys. This can improve performance significantly if the same map keys are frequently encountered, but can degrade performance otherwise.
2224
# * *:allow_unknown_ext* allow to deserialize ext type object with unknown type id as ExtensionValue instance. Otherwise (by default), unpacker throws UnknownExtTypeError.
2325
#
2426
# See also Buffer#initialize for other options.

ext/msgpack/buffer.h

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,4 +473,166 @@ static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_
473473
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
474474
}
475475

476+
// Map keys are likely to be repeated, and are frozen.
477+
// As such we can re-use them if we keep a cache of the ones we've seen so far,
478+
// and save much more expensive lookups into the global fstring table.
479+
// This cache implementation is deliberately simple, as we're optimizing for compactness,
480+
// so that it can be embedded directly in the unpacker struct.
481+
// As such, binary search into a sorted array gives a good tradeoff between compactness and
482+
// performance.
483+
#define MSGPACK_KEY_CACHE_CAPACITY 63
484+
#define MSGPACK_KEY_CACHE_MAX_ENTRY_LENGTH 55
485+
486+
typedef struct msgpack_key_cache_t msgpack_key_cache_t;
487+
struct msgpack_key_cache_t {
488+
int length;
489+
VALUE entries[MSGPACK_KEY_CACHE_CAPACITY];
490+
};
491+
492+
static inline VALUE build_interned_string(const char *str, const long length)
493+
{
494+
# ifdef HAVE_RB_ENC_INTERNED_STR
495+
return rb_enc_interned_str(str, length, rb_utf8_encoding());
496+
# else
497+
VALUE rstring = rb_utf8_str_new(str, length);
498+
return rb_funcall(rb_str_freeze(rstring), i_uminus, 0);
499+
# endif
500+
}
501+
502+
/*
 * Builds a Symbol from the `length` bytes at `str`, going through an
 * interned UTF-8 string first.
 */
static inline VALUE build_symbol(const char *str, const long length)
{
    VALUE name = build_interned_string(str, length);
    return rb_str_intern(name);
}
506+
507+
/*
 * Inserts `rstring` at position `index`, shifting every entry from `index`
 * onwards one slot to the right.
 *
 * This function does no bounds checking: the caller must ensure the cache
 * is not full and that `index` is within [0, cache->length].
 */
static void rvalue_cache_insert_at(msgpack_key_cache_t *cache, int index, VALUE rstring)
{
    VALUE *slot = &cache->entries[index];
    const int tail = cache->length - index; /* entries that must move right */

    MEMMOVE(slot + 1, slot, VALUE, tail);
    *slot = rstring;
    cache->length += 1;
}
513+
514+
/*
 * Ordering function used by the key cache.
 *
 * Compares the raw bytes (`str`, `length`) against the Ruby String
 * `rstring`: strings of different lengths are ordered by length, strings of
 * equal length by memcmp(). Returns <0, 0 or >0 accordingly.
 */
static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
    const long cached_length = RSTRING_LEN(rstring);

    if (length != cached_length) {
        return (int)(length - cached_length);
    }
    return memcmp(str, RSTRING_PTR(rstring), length);
}
523+
524+
/*
 * Looks up the key made of the `length` bytes at `str` in `cache`, which
 * must only contain Strings.
 *
 * Returns:
 *   - the cached String on a hit;
 *   - a newly interned String on a miss (also inserted into the cache, at
 *     its sorted position, when the cache is not full);
 *   - Qfalse when the key is not eligible for caching (empty, longer than
 *     MSGPACK_KEY_CACHE_MAX_ENTRY_LENGTH, or not starting with a letter).
 *     In that case nothing was built and the caller must read the key by
 *     other means.
 */
static VALUE rstring_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
    if (RB_UNLIKELY(length <= 0 || length > MSGPACK_KEY_CACHE_MAX_ENTRY_LENGTH)) {
        // Common names aren't likely to be very long. So we just don't
        // cache names above an arbitrary threshold.
        // Empty keys are rejected too: str[0] is not part of the key, so it
        // must not be read below.
        return Qfalse;
    }

    // The cast is required: passing a negative `char` (any non-ASCII byte
    // where `char` is signed) to isalpha() is undefined behavior.
    if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
        // Simple heuristic, if the first character isn't a letter,
        // we're much less likely to see this string again.
        // We mostly want to cache strings that are likely to be repeated.
        return Qfalse;
    }

    int low = 0;
    int high = cache->length - 1;
    int mid = 0;
    int last_cmp = 0;

    // Binary search over the sorted entries.
    while (low <= high) {
        mid = (high + low) >> 1;
        VALUE entry = cache->entries[mid];
        last_cmp = rstring_cache_cmp(str, length, entry);

        if (last_cmp == 0) {
            return entry;
        } else if (last_cmp > 0) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    VALUE rstring = build_interned_string(str, length);

    if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
        // `mid` is the last slot probed; if the key sorts after it, the
        // insertion point is the slot right behind it.
        if (last_cmp > 0) {
            mid += 1;
        }

        rvalue_cache_insert_at(cache, mid, rstring);
    }
    return rstring;
}
569+
570+
/*
 * Looks up the key made of the `length` bytes at `str` in `cache`, which
 * must only contain Symbols.
 *
 * Returns:
 *   - the cached Symbol on a hit;
 *   - a newly built Symbol on a miss (also inserted into the cache, at its
 *     sorted position, when the cache is not full);
 *   - Qfalse when the key is not eligible for caching (empty, longer than
 *     MSGPACK_KEY_CACHE_MAX_ENTRY_LENGTH, or not starting with a letter).
 *     In that case nothing was built and the caller must read the key by
 *     other means.
 */
static VALUE rsymbol_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
    if (RB_UNLIKELY(length <= 0 || length > MSGPACK_KEY_CACHE_MAX_ENTRY_LENGTH)) {
        // Common names aren't likely to be very long. So we just don't
        // cache names above an arbitrary threshold.
        // Empty keys are rejected too: str[0] is not part of the key, so it
        // must not be read below.
        return Qfalse;
    }

    // The cast is required: passing a negative `char` (any non-ASCII byte
    // where `char` is signed) to isalpha() is undefined behavior.
    if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
        // Simple heuristic, if the first character isn't a letter,
        // we're much less likely to see this string again.
        // We mostly want to cache strings that are likely to be repeated.
        return Qfalse;
    }

    int low = 0;
    int high = cache->length - 1;
    int mid = 0;
    int last_cmp = 0;

    // Binary search over the sorted entries, comparing against each
    // Symbol's name.
    while (low <= high) {
        mid = (high + low) >> 1;
        VALUE entry = cache->entries[mid];
        last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));

        if (last_cmp == 0) {
            return entry;
        } else if (last_cmp > 0) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    VALUE rsymbol = build_symbol(str, length);

    if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
        // `mid` is the last slot probed; if the key sorts after it, the
        // insertion point is the slot right behind it.
        if (last_cmp > 0) {
            mid += 1;
        }

        rvalue_cache_insert_at(cache, mid, rsymbol);
    }
    return rsymbol;
}
615+
616+
/*
 * Reads the next `length` bytes of the buffer as a Symbol map key, going
 * through the key cache.
 *
 * When rsymbol_cache_fetch() returns a Symbol, the bytes are marked as
 * consumed here. When it returns Qfalse, the key is read with
 * msgpack_buffer_read_top_as_symbol() instead.
 */
static inline VALUE msgpack_buffer_read_top_as_interned_symbol(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
    VALUE cached = rsymbol_cache_fetch(cache, b->read_buffer, length);

    if (RB_UNLIKELY(!cached)) {
        return msgpack_buffer_read_top_as_symbol(b, length, true);
    }

    _msgpack_buffer_consumed(b, length);
    return cached;
}
626+
627+
/*
 * Reads the next `length` bytes of the buffer as a String map key, going
 * through the key cache.
 *
 * When rstring_cache_fetch() returns a String, the bytes are marked as
 * consumed here. When it returns Qfalse, the key is read with
 * msgpack_buffer_read_top_as_string() instead.
 */
static inline VALUE msgpack_buffer_read_top_as_interned_string(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
    VALUE cached = rstring_cache_fetch(cache, b->read_buffer, length);

    if (RB_UNLIKELY(!cached)) {
        return msgpack_buffer_read_top_as_string(b, length, true, true);
    }

    _msgpack_buffer_consumed(b, length);
    return cached;
}
637+
476638
#endif

ext/msgpack/unpacker.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,20 @@ void msgpack_unpacker_mark_stack(msgpack_unpacker_stack_t* stack)
130130
}
131131
}
132132

133+
/*
 * GC mark callback helper: marks every key currently stored in `cache`.
 * Only the first `cache->length` slots are visited.
 */
void msgpack_unpacker_mark_key_cache(msgpack_key_cache_t *cache)
{
    const VALUE *entry = cache->entries;
    const VALUE *const end = entry + cache->length;

    while (entry < end) {
        rb_gc_mark(*entry);
        entry++;
    }
}
140+
133141
void msgpack_unpacker_mark(msgpack_unpacker_t* uk)
134142
{
135143
rb_gc_mark(uk->last_object);
136144
rb_gc_mark(uk->reading_raw);
137145
msgpack_unpacker_mark_stack(&uk->stack);
146+
msgpack_unpacker_mark_key_cache(&uk->key_cache);
138147
/* See MessagePack_Buffer_wrap */
139148
/* msgpack_buffer_mark(UNPACKER_BUFFER_(uk)); */
140149
rb_gc_mark(uk->buffer_ref);
@@ -374,15 +383,32 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
374383
size_t length = uk->reading_raw_remaining;
375384
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
376385
int ret;
377-
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
386+
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type)) {
378387
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY);
379388
ret = object_complete_symbol(uk, symbol);
389+
} else if (is_reading_map_key(uk) && raw_type == RAW_TYPE_STRING) {
390+
/* don't use zerocopy for hash keys but get a frozen string directly
391+
* because rb_hash_aset freezes keys and it causes copying */
392+
VALUE key;
393+
if (uk->symbolize_keys) {
394+
if (uk->use_key_cache) {
395+
key = msgpack_buffer_read_top_as_interned_symbol(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
396+
} else {
397+
key = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, true);
398+
}
399+
ret = object_complete_symbol(uk, key);
400+
} else {
401+
if (uk->use_key_cache) {
402+
key = msgpack_buffer_read_top_as_interned_string(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
403+
} else {
404+
key = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, true, true);
405+
}
406+
407+
ret = object_complete(uk, key);
408+
}
380409
} else {
381410
bool will_freeze = uk->freeze;
382411
if(raw_type == RAW_TYPE_STRING || raw_type == RAW_TYPE_BINARY) {
383-
/* don't use zerocopy for hash keys but get a frozen string directly
384-
* because rb_hash_aset freezes keys and it causes copying */
385-
will_freeze = will_freeze || is_reading_map_key(uk);
386412
VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, will_freeze, raw_type == RAW_TYPE_STRING);
387413
ret = object_complete(uk, string);
388414
} else {

ext/msgpack/unpacker.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct msgpack_unpacker_stack_t {
5050
struct msgpack_unpacker_t {
5151
msgpack_buffer_t buffer;
5252
msgpack_unpacker_stack_t stack;
53+
msgpack_key_cache_t key_cache;
5354

5455
VALUE self;
5556
VALUE last_object;
@@ -66,10 +67,12 @@ struct msgpack_unpacker_t {
6667

6768
/* options */
6869
int symbol_ext_type;
69-
bool symbolize_keys;
70-
bool freeze;
71-
bool allow_unknown_ext;
72-
bool optimized_symbol_ext_type;
70+
71+
bool use_key_cache: 1;
72+
bool symbolize_keys: 1;
73+
bool freeze: 1;
74+
bool allow_unknown_ext: 1;
75+
bool optimized_symbol_ext_type: 1;
7376
};
7477

7578
#define UNPACKER_BUFFER_(uk) (&(uk)->buffer)
@@ -101,6 +104,11 @@ static inline void msgpack_unpacker_set_symbolized_keys(msgpack_unpacker_t* uk,
101104
uk->symbolize_keys = enable;
102105
}
103106

107+
/* Sets the unpacker's `use_key_cache` option flag to `enable`. */
static inline void msgpack_unpacker_set_key_cache(msgpack_unpacker_t* uk, bool enable)
{
    uk->use_key_cache = enable;
}
111+
104112
static inline void msgpack_unpacker_set_freeze(msgpack_unpacker_t* uk, bool enable)
105113
{
106114
uk->freeze = enable;

ext/msgpack/unpacker_class.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ static VALUE eUnknownExtTypeError;
3434
static VALUE mTypeError; // obsoleted. only for backward compatibility. See #86.
3535

3636
static VALUE sym_symbolize_keys;
37+
static VALUE sym_key_cache;
3738
static VALUE sym_freeze;
3839
static VALUE sym_allow_unknown_ext;
3940

@@ -128,6 +129,9 @@ VALUE MessagePack_Unpacker_initialize(int argc, VALUE* argv, VALUE self)
128129
if(options != Qnil) {
129130
VALUE v;
130131

132+
v = rb_hash_aref(options, sym_key_cache);
133+
msgpack_unpacker_set_key_cache(uk, RTEST(v));
134+
131135
v = rb_hash_aref(options, sym_symbolize_keys);
132136
msgpack_unpacker_set_symbolized_keys(uk, RTEST(v));
133137

@@ -413,6 +417,7 @@ void MessagePack_Unpacker_module_init(VALUE mMessagePack)
413417
eUnknownExtTypeError = rb_define_class_under(mMessagePack, "UnknownExtTypeError", eUnpackError);
414418

415419
sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
420+
sym_key_cache = ID2SYM(rb_intern("key_cache"));
416421
sym_freeze = ID2SYM(rb_intern("freeze"));
417422
sym_allow_unknown_ext = ID2SYM(rb_intern("allow_unknown_ext"));
418423

0 commit comments

Comments
 (0)