From 767136a7296ec75edb188d4bba456d5be469a5bc Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 5 Feb 2022 21:31:28 +0900 Subject: [PATCH 1/6] Implement vector based implementation for small dict. --- Include/internal/pycore_dict.h | 2 + Objects/dictobject.c | 302 ++++++++++++++++++++++++++++++--- 2 files changed, 276 insertions(+), 28 deletions(-) diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index 64d70d187df4eb..1b0c22a7fc5ab1 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -130,6 +130,8 @@ struct _dictvalues { #endif #define DK_ENTRIES(dk) \ ((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[DK_SIZE(dk) * DK_IXSIZE(dk)])) +#define DK_ENTRIES_MIN(dk) \ + ((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[8])) extern uint64_t _pydict_global_version; diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 63e3eda49881a8..166f6a8f32e9e4 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -240,6 +240,7 @@ uint64_t _pydict_global_version = 0; #include "clinic/dictobject.c.h" +#define DK_ISVECTOR(dk) ((dk)->dk_log2_size == PyDict_LOG_MINSIZE) #if PyDict_MAXFREELIST > 0 static struct _Py_dict_state * @@ -377,6 +378,79 @@ dictkeys_set_index(PyDictKeysObject *keys, Py_ssize_t i, Py_ssize_t ix) } } +// 8x8 simd-like functions. +// https://graphics.stanford.edu/~seander/bithacks.html + +static const uint64_t lsb = 0x0101010101010101ull; +static const uint64_t msb = 0x8080808080808080ull; + +// 0x00 -> 0x80, otherwise -> 0x00. +// Caution: This function has false positive in limited case. +static inline uint64_t +haszero(uint64_t v) { + return (((v) - lsb) & ~(v) & msb); +} + +// Find n in x. n should be 7bit value. +static inline uint64_t +hasvalue(uint64_t x, uint8_t n) { + return haszero(x ^ (n*lsb)); +} + +/* CountTrailingZeroesNonzero64 + +This function is copied from: +https://github.com/abseil/abseil-cpp/blob/1ae9b71c474628d60eb251a3f62967fe64151bb2/absl/numeric/internal/bits.h#L273 + +License of this function: +// Copyright 2020 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +Modifications: + +* Port to C, remove some macros. + +*/ +static inline int +CountTrailingZeroesNonzero64(uint64_t x) { +#if (defined(__clang__) || defined(__GNUC__)) + return __builtin_ctzll(x); +#elif defined(_MSC_VER) && !defined(__clang__) && \ + (defined(_M_X64) || defined(_M_ARM64)) + unsigned long result = 0; + _BitScanForward64(&result, x); + return result; +#elif defined(_MSC_VER) && !defined(__clang__) + unsigned long result = 0; + if ((uint32_t)(x) == 0) { + _BitScanForward(&result, (uint32_t)(x >> 32)); + return result + 32; + } + _BitScanForward(&result, (unsigned long)(x)); + return result; +#else + int c = 63; + x &= ~x + 1; + if (x & 0x00000000FFFFFFFF) c -= 32; + if (x & 0x0000FFFF0000FFFF) c -= 16; + if (x & 0x00FF00FF00FF00FF) c -= 8; + if (x & 0x0F0F0F0F0F0F0F0F) c -= 4; + if (x & 0x3333333333333333) c -= 2; + if (x & 0x5555555555555555) c -= 1; + return c; +#endif +} /* USABLE_FRACTION is the maximum dictionary load. * Increasing this ratio makes dictionaries more dense resulting in more @@ -465,7 +539,7 @@ struct { #define Py_EMPTY_KEYS &empty_keys_struct /* Uncomment to check the dict content in _PyDict_CheckConsistency() */ -/* #define DEBUG_PYDICT */ +// #define DEBUG_PYDICT #ifdef DEBUG_PYDICT # define ASSERT_CONSISTENT(op) assert(_PyDict_CheckConsistency((PyObject *)(op), 1)) @@ -481,14 +555,44 @@ get_index_from_order(PyDictObject *mp, Py_ssize_t i) return ((char *)mp->ma_values)[-3-i]; } +void +dump_dictkeys(PyDictKeysObject *dk) +{ + fprintf(stderr, "dict object %p\n", dk); + fprintf(stderr, " log2_size=%d\n", dk->dk_log2_size); + fprintf(stderr, " usable=%d\n", dk->dk_usable); + fprintf(stderr, " nentries=%d\n", dk->dk_nentries); + + if (dk->dk_log2_size == PyDict_LOG_MINSIZE) { + const uint8_t *index = (const uint8_t*) dk->dk_indices; + fprintf(stderr, " indices: %x %x %x %x %x %x %x %x\n", + index[0], index[1], index[2], index[3], + index[4], index[5], index[6], index[7]); + } + else { + fprintf(stderr, " indices:\n"); + int n = DK_SIZE(dk); + for (int i = 0; i < n; i++) { + fprintf(stderr, " - %d: %d\n", i, (int)dictkeys_get_index(dk, i)); + } + } + + PyDictKeyEntry *ep = DK_ENTRIES(dk); + Py_ssize_t usable = USABLE_FRACTION(DK_SIZE(dk)); + for (int i = 0; ime_key, ep->me_hash); + } + fprintf(stderr, "\n"); +} + int _PyDict_CheckConsistency(PyObject *op, int check_content) { #define CHECK(expr) \ - do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) + do { if (!(expr)) { dump_dictkeys(keys); _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) assert(op != NULL); - CHECK(PyDict_Check(op)); + //CHECK(PyDict_Check(op)); PyDictObject *mp = (PyDictObject *)op; PyDictKeysObject *keys = mp->ma_keys; @@ -511,15 +615,28 @@ _PyDict_CheckConsistency(PyObject *op, int check_content) if (check_content) { PyDictKeyEntry *entries = DK_ENTRIES(keys); - for (Py_ssize_t i=0; i < DK_SIZE(keys); i++) { - Py_ssize_t ix = dictkeys_get_index(keys, i); - CHECK(DKIX_DUMMY <= ix && ix <= usable); + if (!DK_ISVECTOR(keys)) { + for (Py_ssize_t i=0; i < DK_SIZE(keys); i++) { + Py_ssize_t ix = dictkeys_get_index(keys, i); + CHECK(DKIX_DUMMY <= ix && ix <= usable); + + if (ix >= 0) { + CHECK(entries[ix].me_key != NULL); + } + } } for (Py_ssize_t i=0; i < usable; i++) { PyDictKeyEntry *entry = &entries[i]; PyObject *key = entry->me_key; + if (DK_ISVECTOR(keys)) { + int h7 = dictkeys_get_index(keys, i); + if (entries[i].me_key) { + CHECK((entry->me_hash & 0x7f) == h7); + } + } + if (key != NULL) { if (PyUnicode_CheckExact(key)) { Py_hash_t hash = ((PyASCIIObject *)key)->hash; @@ -779,6 +896,7 @@ PyDict_New(void) static Py_ssize_t lookdict_index(PyDictKeysObject *k, Py_hash_t hash, Py_ssize_t index) { + assert(!DK_ISVECTOR(k)); size_t mask = DK_MASK(k); size_t perturb = (size_t)hash; size_t i = (size_t)hash & mask; @@ -797,9 +915,43 @@ lookdict_index(PyDictKeysObject *k, Py_hash_t hash, Py_ssize_t index) Py_UNREACHABLE(); } +static Py_ssize_t +dictkeys_stringlookup_vector(PyDictKeysObject* dk, PyObject *key, Py_hash_t hash) +{ + assert(DK_ISVECTOR(dk)); + const uint64_t indices = *(const uint64_t*)(dk->dk_indices); + uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); + PyDictKeyEntry *ep0 = DK_ENTRIES_MIN(dk); + + while (found) { + // 80 byte means found. LSB first. + // Example: 00 00 00 80 00 00 80 00 00 -> 2, 5 + int pos = CountTrailingZeroesNonzero64(found); + assert((pos+1) % 8 == 0); + // 7 >> 3 = 0 + // (8+7) >> 3 = 1 + // (16+7) >> 3 = 2 + // ... + pos >>= 3; + + PyDictKeyEntry *ep = &ep0[pos]; + if (ep->me_key == key || + (ep->me_hash == hash && unicode_eq(ep->me_key, key))) { + return pos; + } + + // reset lowest 1 bit. + found &= (found - 1); + } + return DKIX_EMPTY; +} + static Py_ssize_t dictkeys_stringlookup(PyDictKeysObject* dk, PyObject *key, Py_hash_t hash) { + if (DK_ISVECTOR(dk)) { + return dictkeys_stringlookup_vector(dk, key, hash); + } PyDictKeyEntry *ep0 = DK_ENTRIES(dk); size_t mask = DK_MASK(dk); size_t perturb = hash; @@ -899,11 +1051,62 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu } return ix; } - PyDictKeyEntry *ep0 = DK_ENTRIES(dk); + + Py_ssize_t ix; + PyDictKeyEntry *ep0; + + if (DK_ISVECTOR(dk)) { + // Use vector instead of hash table. + const uint64_t indices = *(const uint64_t*)(dk->dk_indices); + // found: 80 byte means found. LSB first. + // Example: 00 00 00 80 00 00 80 00 00 means 2, 5 + uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); + ep0 = DK_ENTRIES_MIN(dk); + + while (found) { + ix = CountTrailingZeroesNonzero64(found); + assert((ix+1) % 8 == 0); + // 7 >> 3 = 0 + // (8+7) >> 3 = 1 + // (16+7) >> 3 = 2 + // ... + ix >>= 3; + + PyDictKeyEntry *ep = &ep0[ix]; + if (ep->me_key == key) { + goto found; + } + if (ep->me_hash == hash) { + PyObject *startkey = ep->me_key; + Py_INCREF(startkey); + int cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); + Py_DECREF(startkey); + if (cmp < 0) { + *value_addr = NULL; + return DKIX_ERROR; + } + if (dk == mp->ma_keys && ep->me_key == startkey) { + if (cmp > 0) { + goto found; + } + } + else { + /* The dict was mutated, restart */ + goto start; + } + } + + // reset lowest 1 bit. + found &= (found - 1); + } + *value_addr = NULL; + return DKIX_EMPTY; + } + + ep0 = DK_ENTRIES(dk); size_t mask = DK_MASK(dk); size_t perturb = hash; size_t i = (size_t)hash & mask; - Py_ssize_t ix; for (;;) { ix = dictkeys_get_index(dk, i); if (ix == DKIX_EMPTY) { @@ -1019,6 +1222,7 @@ static Py_ssize_t find_empty_slot(PyDictKeysObject *keys, Py_hash_t hash) { assert(keys != NULL); + assert(!DK_ISVECTOR(keys)); const size_t mask = DK_MASK(keys); size_t i = hash & mask; @@ -1057,10 +1261,15 @@ insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name) Py_INCREF(name); /* Insert into new slot. */ keys->dk_version = 0; - Py_ssize_t hashpos = find_empty_slot(keys, hash); ix = keys->dk_nentries; PyDictKeyEntry *ep = &DK_ENTRIES(keys)[ix]; - dictkeys_set_index(keys, hashpos, ix); + if (DK_ISVECTOR(keys)) { + dictkeys_set_index(keys, ix, hash & 0x7f); + } + else { + Py_ssize_t hashpos = find_empty_slot(keys, hash); + dictkeys_set_index(keys, hashpos, ix); + } assert(ep->me_key == NULL); ep->me_key = name; ep->me_hash = hash; @@ -1106,9 +1315,14 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value) if (!PyUnicode_CheckExact(key) && mp->ma_keys->dk_kind != DICT_KEYS_GENERAL) { mp->ma_keys->dk_kind = DICT_KEYS_GENERAL; } - Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash); + if (DK_ISVECTOR(mp->ma_keys)) { + dictkeys_set_index(mp->ma_keys, mp->ma_keys->dk_nentries, hash & 0x7f); + } + else { + Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash); + dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries); + } ep = &DK_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries]; - dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries); ep->me_key = key; ep->me_hash = hash; if (mp->ma_values) { @@ -1177,9 +1391,8 @@ insert_to_emptydict(PyDictObject *mp, PyObject *key, Py_hash_t hash, MAINTAIN_TRACKING(mp, key, value); - size_t hashpos = (size_t)hash & (PyDict_MINSIZE-1); - PyDictKeyEntry *ep = DK_ENTRIES(mp->ma_keys); - dictkeys_set_index(mp->ma_keys, hashpos, 0); + PyDictKeyEntry *ep = DK_ENTRIES_MIN(mp->ma_keys); + dictkeys_set_index(mp->ma_keys, 0, hash&0x7f); ep->me_key = key; ep->me_hash = hash; ep->me_value = value; @@ -1196,6 +1409,13 @@ Internal routine used by dictresize() to build a hashtable of entries. static void build_indices(PyDictKeysObject *keys, PyDictKeyEntry *ep, Py_ssize_t n) { + if (DK_ISVECTOR(keys)) { + for (Py_ssize_t ix = 0; ix < n; ix++, ep++) { + Py_hash_t hash = ep->me_hash; + dictkeys_set_index(keys, ix, hash & 0x7f); + } + return; + } size_t mask = DK_MASK(keys); for (Py_ssize_t ix = 0; ix != n; ix++, ep++) { Py_hash_t hash = ep->me_hash; @@ -1650,10 +1870,6 @@ delitem_common(PyDictObject *mp, Py_hash_t hash, Py_ssize_t ix, PyObject *old_key; PyDictKeyEntry *ep; - Py_ssize_t hashpos = lookdict_index(mp->ma_keys, hash, ix); - assert(hashpos >= 0); - - mp->ma_used--; mp->ma_version_tag = DICT_NEXT_VERSION(); ep = &DK_ENTRIES(mp->ma_keys)[ix]; if (mp->ma_values) { @@ -1666,13 +1882,25 @@ delitem_common(PyDictObject *mp, Py_hash_t hash, Py_ssize_t ix, } else { mp->ma_keys->dk_version = 0; - dictkeys_set_index(mp->ma_keys, hashpos, DKIX_DUMMY); + if (DK_ISVECTOR(mp->ma_keys)) { + dictkeys_set_index(mp->ma_keys, ix, DKIX_EMPTY); + if (ix+1 == mp->ma_keys->dk_nentries) { + mp->ma_keys->dk_nentries--; + mp->ma_keys->dk_usable++; + } + } + else { + Py_ssize_t hashpos = lookdict_index(mp->ma_keys, hash, ix); + assert(hashpos >= 0); + dictkeys_set_index(mp->ma_keys, hashpos, DKIX_DUMMY); + } old_key = ep->me_key; ep->me_key = NULL; ep->me_value = NULL; Py_DECREF(old_key); } Py_DECREF(old_value); + mp->ma_used--; ASSERT_CONSISTENT(mp); return 0; @@ -1753,8 +1981,13 @@ _PyDict_DelItemIf(PyObject *op, PyObject *key, if (res == -1) return -1; - hashpos = lookdict_index(mp->ma_keys, hash, ix); - assert(hashpos >= 0); + if (DK_ISVECTOR(mp->ma_keys)) { + hashpos = ix; + } + else { + hashpos = lookdict_index(mp->ma_keys, hash, ix); + assert(hashpos >= 0); + } if (res > 0) return delitem_common(mp, hashpos, ix, old_value); @@ -3050,10 +3283,15 @@ PyDict_SetDefault(PyObject *d, PyObject *key, PyObject *defaultobj) if (!PyUnicode_CheckExact(key) && mp->ma_keys->dk_kind != DICT_KEYS_GENERAL) { mp->ma_keys->dk_kind = DICT_KEYS_GENERAL; } - Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash); + if (DK_ISVECTOR(mp->ma_keys)) { + dictkeys_set_index(mp->ma_keys, mp->ma_keys->dk_nentries, hash&0x7f); + } + else { + Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash); + dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries); + } ep0 = DK_ENTRIES(mp->ma_keys); ep = &ep0[mp->ma_keys->dk_nentries]; - dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries); Py_INCREF(key); Py_INCREF(value); MAINTAIN_TRACKING(mp, key, value); @@ -3194,10 +3432,18 @@ dict_popitem_impl(PyDictObject *self) assert(i >= 0); ep = &ep0[i]; - j = lookdict_index(self->ma_keys, ep->me_hash, i); - assert(j >= 0); - assert(dictkeys_get_index(self->ma_keys, j) == i); - dictkeys_set_index(self->ma_keys, j, DKIX_DUMMY); + if (DK_ISVECTOR(self->ma_keys)) { + dictkeys_set_index(self->ma_keys, i, DKIX_EMPTY); + if (i == self->ma_keys->dk_nentries - 1) { + self->ma_keys->dk_usable++; + } + } + else { + j = lookdict_index(self->ma_keys, ep->me_hash, i); + assert(j >= 0); + assert(dictkeys_get_index(self->ma_keys, j) == i); + dictkeys_set_index(self->ma_keys, j, DKIX_DUMMY); + } PyTuple_SET_ITEM(res, 0, ep->me_key); PyTuple_SET_ITEM(res, 1, ep->me_value); From ff2c5b965647c8a8ed0a4e5d5a93232e8a6465a0 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 6 Feb 2022 13:10:50 +0900 Subject: [PATCH 2/6] Use SSE2 --- Objects/dictobject.c | 50 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 166f6a8f32e9e4..228cf78644d1bd 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -122,6 +122,16 @@ As a consequence of this, split keys have a maximum size of 16. #include "pycore_pystate.h" // _PyThreadState_GET() #include "stringlib/eq.h" // unicode_eq() + +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && (defined(_M_X64) || (defined(_M_IX86)))) +#define HAVE_SSE2 1 +#include +#include +#else +#define HAVE_SSE2 0 +#endif + /*[clinic input] class dict "PyDictObject *" "&PyDict_Type" [clinic start generated code]*/ @@ -560,8 +570,8 @@ dump_dictkeys(PyDictKeysObject *dk) { fprintf(stderr, "dict object %p\n", dk); fprintf(stderr, " log2_size=%d\n", dk->dk_log2_size); - fprintf(stderr, " usable=%d\n", dk->dk_usable); - fprintf(stderr, " nentries=%d\n", dk->dk_nentries); + fprintf(stderr, " usable=%ld\n", dk->dk_usable); + fprintf(stderr, " nentries=%ld\n", dk->dk_nentries); if (dk->dk_log2_size == PyDict_LOG_MINSIZE) { const uint8_t *index = (const uint8_t*) dk->dk_indices; @@ -580,7 +590,7 @@ dump_dictkeys(PyDictKeysObject *dk) PyDictKeyEntry *ep = DK_ENTRIES(dk); Py_ssize_t usable = USABLE_FRACTION(DK_SIZE(dk)); for (int i = 0; ime_key, ep->me_hash); + fprintf(stderr, " %d: key=%p hash=%lx\n", i, ep->me_key, ep->me_hash); } fprintf(stderr, "\n"); } @@ -919,24 +929,36 @@ static Py_ssize_t dictkeys_stringlookup_vector(PyDictKeysObject* dk, PyObject *key, Py_hash_t hash) { assert(DK_ISVECTOR(dk)); + PyDictKeyEntry *ep0 = DK_ENTRIES_MIN(dk); const uint64_t indices = *(const uint64_t*)(dk->dk_indices); + +#if HAVE_SSE2 + uint64_t found = _mm_movemask_pi8( + _mm_cmpeq_pi8(_mm_set1_pi8(hash & 0x7f), _m_from_int64((int64_t)indices))); +#else uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); - PyDictKeyEntry *ep0 = DK_ENTRIES_MIN(dk); +#endif while (found) { // 80 byte means found. LSB first. // Example: 00 00 00 80 00 00 80 00 00 -> 2, 5 int pos = CountTrailingZeroesNonzero64(found); +#if !HAVE_SSE2 assert((pos+1) % 8 == 0); // 7 >> 3 = 0 // (8+7) >> 3 = 1 // (16+7) >> 3 = 2 // ... pos >>= 3; +#endif PyDictKeyEntry *ep = &ep0[pos]; if (ep->me_key == key || - (ep->me_hash == hash && unicode_eq(ep->me_key, key))) { + (ep->me_hash == hash +#if !HAVE_SSE2 /* hasvalue() has rare false positive. */ + && ep->me_key != NULL +#endif + && unicode_eq(ep->me_key, key))) { return pos; } @@ -1056,27 +1078,37 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu PyDictKeyEntry *ep0; if (DK_ISVECTOR(dk)) { + ep0 = DK_ENTRIES_MIN(dk); // Use vector instead of hash table. const uint64_t indices = *(const uint64_t*)(dk->dk_indices); - // found: 80 byte means found. LSB first. - // Example: 00 00 00 80 00 00 80 00 00 means 2, 5 + +#if HAVE_SSE2 + uint64_t found = _mm_movemask_pi8( + _mm_cmpeq_pi8(_mm_set1_pi8(hash & 0x7f), _m_from_int64((int64_t)indices))); +#else uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); - ep0 = DK_ENTRIES_MIN(dk); +#endif while (found) { ix = CountTrailingZeroesNonzero64(found); +#if !HAVE_SSE2 assert((ix+1) % 8 == 0); // 7 >> 3 = 0 // (8+7) >> 3 = 1 // (16+7) >> 3 = 2 // ... ix >>= 3; +#endif PyDictKeyEntry *ep = &ep0[ix]; if (ep->me_key == key) { goto found; } - if (ep->me_hash == hash) { + if (ep->me_hash == hash +#if !HAVE_SSE2 /* hasvalue() has rare false positive. */ + && ep->me_key != NULL +#endif + ) { PyObject *startkey = ep->me_key; Py_INCREF(startkey); int cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); From 8caccabd92363bedfbca099155663c6be9225c12 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 6 Feb 2022 18:07:53 +0900 Subject: [PATCH 3/6] Use 8 entries --- Objects/dictobject.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 228cf78644d1bd..be7e24eeb3edf9 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -473,7 +473,11 @@ CountTrailingZeroesNonzero64(uint64_t x) { * USABLE_FRACTION should be quick to calculate. * Fractions around 1/2 to 2/3 seem to work well in practice. */ -#define USABLE_FRACTION(n) (((n) << 1)/3) +static inline Py_ssize_t +USABLE_FRACTION(Py_ssize_t n) { + if (n == 8) return 8; + return n * 2 / 3; +} /* Find the smallest dk_size >= minsize. */ static inline uint8_t @@ -568,11 +572,13 @@ get_index_from_order(PyDictObject *mp, Py_ssize_t i) void dump_dictkeys(PyDictKeysObject *dk) { - fprintf(stderr, "dict object %p\n", dk); + fprintf(stderr, "dictkeys object %p\n", dk); + fprintf(stderr, " kind=%d\n", dk->dk_kind); fprintf(stderr, " log2_size=%d\n", dk->dk_log2_size); fprintf(stderr, " usable=%ld\n", dk->dk_usable); fprintf(stderr, " nentries=%ld\n", dk->dk_nentries); + Py_ssize_t usable = USABLE_FRACTION(DK_SIZE(dk)); if (dk->dk_log2_size == PyDict_LOG_MINSIZE) { const uint8_t *index = (const uint8_t*) dk->dk_indices; fprintf(stderr, " indices: %x %x %x %x %x %x %x %x\n", @@ -588,7 +594,6 @@ dump_dictkeys(PyDictKeysObject *dk) } PyDictKeyEntry *ep = DK_ENTRIES(dk); - Py_ssize_t usable = USABLE_FRACTION(DK_SIZE(dk)); for (int i = 0; ime_key, ep->me_hash); } @@ -599,7 +604,7 @@ int _PyDict_CheckConsistency(PyObject *op, int check_content) { #define CHECK(expr) \ - do { if (!(expr)) { dump_dictkeys(keys); _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) + do { if (!(expr)) { /* dump_dictkeys(keys); */ _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) assert(op != NULL); //CHECK(PyDict_Check(op)); From c4e91b9967f779221e6345c8557a26da967f258e Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 13 Feb 2022 18:00:25 +0900 Subject: [PATCH 4/6] Use 16-wide vector when SSE2 is available --- Include/internal/pycore_dict.h | 2 +- Objects/dictobject.c | 47 ++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index 1b0c22a7fc5ab1..f7d68a2fdcc51c 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -131,7 +131,7 @@ struct _dictvalues { #define DK_ENTRIES(dk) \ ((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[DK_SIZE(dk) * DK_IXSIZE(dk)])) #define DK_ENTRIES_MIN(dk) \ - ((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[8])) + ((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[DK_SIZE(dk)])) extern uint64_t _pydict_global_version; diff --git a/Objects/dictobject.c b/Objects/dictobject.c index be7e24eeb3edf9..de8dc8801a7c44 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -250,7 +250,13 @@ uint64_t _pydict_global_version = 0; #include "clinic/dictobject.c.h" -#define DK_ISVECTOR(dk) ((dk)->dk_log2_size == PyDict_LOG_MINSIZE) +#if HAVE_SSE2 +// Use vector16 when SSE2 is available +#define DK_ISVECTOR(dk) ((dk)->dk_log2_size <= 4) +#else +// Use vector8 otherwise. +#define DK_ISVECTOR(dk) ((dk)->dk_log2_size == 3) +#endif #if PyDict_MAXFREELIST > 0 static struct _Py_dict_state * @@ -475,7 +481,11 @@ CountTrailingZeroesNonzero64(uint64_t x) { */ static inline Py_ssize_t USABLE_FRACTION(Py_ssize_t n) { +#if HAVE_SSE2 + if (n <= 16) return n; +#else if (n == 8) return 8; +#endif return n * 2 / 3; } @@ -531,7 +541,7 @@ estimate_log2_keysize(Py_ssize_t n) */ static PyDictKeysObject empty_keys_struct = { 1, /* dk_refcnt */ - 0, /* dk_log2_size */ + 3, /* dk_log2_size */ DICT_KEYS_SPLIT, /* dk_kind */ 1, /* dk_version */ 0, /* dk_usable (immutable) */ @@ -569,7 +579,7 @@ get_index_from_order(PyDictObject *mp, Py_ssize_t i) return ((char *)mp->ma_values)[-3-i]; } -void +static void dump_dictkeys(PyDictKeysObject *dk) { fprintf(stderr, "dictkeys object %p\n", dk); @@ -578,12 +588,17 @@ dump_dictkeys(PyDictKeysObject *dk) fprintf(stderr, " usable=%ld\n", dk->dk_usable); fprintf(stderr, " nentries=%ld\n", dk->dk_nentries); - Py_ssize_t usable = USABLE_FRACTION(DK_SIZE(dk)); - if (dk->dk_log2_size == PyDict_LOG_MINSIZE) { + if (DK_ISVECTOR(dk)) { const uint8_t *index = (const uint8_t*) dk->dk_indices; - fprintf(stderr, " indices: %x %x %x %x %x %x %x %x\n", + fprintf(stderr, " indices: %2x %2x %2x %2x %2x %2x %2x %2x\n", index[0], index[1], index[2], index[3], index[4], index[5], index[6], index[7]); + if (dk->dk_log2_size == 16) { + index += 8; + fprintf(stderr, " %2x %2x %2x %2x %2x %2x %2x %2x\n", + index[0], index[1], index[2], index[3], + index[4], index[5], index[6], index[7]); + } } else { fprintf(stderr, " indices:\n"); @@ -594,7 +609,7 @@ dump_dictkeys(PyDictKeysObject *dk) } PyDictKeyEntry *ep = DK_ENTRIES(dk); - for (int i = 0; idk_nentries; i++, ep++) { fprintf(stderr, " %d: key=%p hash=%lx\n", i, ep->me_key, ep->me_hash); } fprintf(stderr, "\n"); @@ -604,10 +619,9 @@ int _PyDict_CheckConsistency(PyObject *op, int check_content) { #define CHECK(expr) \ - do { if (!(expr)) { /* dump_dictkeys(keys); */ _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) + do { if (!(expr)) { dump_dictkeys(keys); _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) assert(op != NULL); - //CHECK(PyDict_Check(op)); PyDictObject *mp = (PyDictObject *)op; PyDictKeysObject *keys = mp->ma_keys; @@ -938,8 +952,11 @@ dictkeys_stringlookup_vector(PyDictKeysObject* dk, PyObject *key, Py_hash_t hash const uint64_t indices = *(const uint64_t*)(dk->dk_indices); #if HAVE_SSE2 - uint64_t found = _mm_movemask_pi8( - _mm_cmpeq_pi8(_mm_set1_pi8(hash & 0x7f), _m_from_int64((int64_t)indices))); + // vector16 is supported only with SSE2 + uint64_t hi = dk->dk_log2_size == 3 ? -1ULL : ((const uint64_t*)(dk->dk_indices))[1]; + uint64_t found = _mm_movemask_epi8( + _mm_cmpeq_epi8(_mm_set1_epi8(hash & 0x7f), _mm_set_epi64x(hi, indices))); + //fprintf(stderr, "string hi=%llx lo=%llx hash=%x found=%lx\n", hi, indices, (int)hash&0x7f, found); #else uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); #endif @@ -1088,8 +1105,10 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu const uint64_t indices = *(const uint64_t*)(dk->dk_indices); #if HAVE_SSE2 - uint64_t found = _mm_movemask_pi8( - _mm_cmpeq_pi8(_mm_set1_pi8(hash & 0x7f), _m_from_int64((int64_t)indices))); + uint64_t hi = dk->dk_log2_size == 3 ? -1ULL : ((const uint64_t*)(dk->dk_indices))[1]; + uint64_t found = _mm_movemask_epi8( + _mm_cmpeq_epi8(_mm_set1_epi8(hash & 0x7f), _mm_set_epi64x(hi, indices))); + //fprintf(stderr, "general log2=%d hi=%llx lo=%llx hash=%x found=%lx\n", dk->dk_log2_size, hi, indices, (int)hash&0x7f, found); #else uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); #endif @@ -1123,6 +1142,8 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu return DKIX_ERROR; } if (dk == mp->ma_keys && ep->me_key == startkey) { + assert(ep->me_value->ob_type != NULL); + assert(ep->me_value->ob_type->tp_name != NULL); if (cmp > 0) { goto found; } From 5bdc8935b8d1be454ae89a09e9f1502f10b710a4 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 13 Feb 2022 21:07:54 +0900 Subject: [PATCH 5/6] fix estimate_log2_keysize --- Objects/dictobject.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Objects/dictobject.c b/Objects/dictobject.c index de8dc8801a7c44..250f2893cf9542 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -520,6 +520,10 @@ calculate_log2_keysize(Py_ssize_t minsize) static inline uint8_t estimate_log2_keysize(Py_ssize_t n) { + if (n <= 8) return 3; +#if HAVE_SSE2 + if (n <= 16) return 4; +#endif return calculate_log2_keysize((n*3 + 1) / 2); } From 0c56f2f88c1c6a55cdcc4a9bfb8e17a5f401b656 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Wed, 23 Feb 2022 12:53:53 +0900 Subject: [PATCH 6/6] refactoring --- Objects/dictobject.c | 278 ++++++++++++++++++++++++------------------- 1 file changed, 156 insertions(+), 122 deletions(-) diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 250f2893cf9542..b01093d83bd319 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -132,6 +132,135 @@ As a consequence of this, split keys have a maximum size of 16. #define HAVE_SSE2 0 #endif +/**** bit utilities. This will be moved to pycore_bitutils.h ****/ + +/* CountTrailingZeroesNonzero64 + +This function is copied from: +https://github.com/abseil/abseil-cpp/blob/1ae9b71c474628d60eb251a3f62967fe64151bb2/absl/numeric/internal/bits.h#L273 + +License of this function: +// Copyright 2020 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +Modifications by Inada Naoki: + +* Port to C, remove some macros. + +*/ +static inline int +CountTrailingZeroesNonzero64(uint64_t x) { +#if (defined(__clang__) || defined(__GNUC__)) + return __builtin_ctzll(x); +#elif defined(_MSC_VER) && !defined(__clang__) && \ + (defined(_M_X64) || defined(_M_ARM64)) + unsigned long result = 0; + _BitScanForward64(&result, x); + return result; +#elif defined(_MSC_VER) && !defined(__clang__) + unsigned long result = 0; + if ((uint32_t)(x) == 0) { + _BitScanForward(&result, (uint32_t)(x >> 32)); + return result + 32; + } + _BitScanForward(&result, (unsigned long)(x)); + return result; +#else + int c = 63; + x &= ~x + 1; + if (x & 0x00000000FFFFFFFF) c -= 32; + if (x & 0x0000FFFF0000FFFF) c -= 16; + if (x & 0x00FF00FF00FF00FF) c -= 8; + if (x & 0x0F0F0F0F0F0F0F0F) c -= 4; + if (x & 0x3333333333333333) c -= 2; + if (x & 0x5555555555555555) c -= 1; + return c; +#endif +} + +#if HAVE_SSE2 + +#ifdef _MSC_VER +static __inline __m64 +_mm_set_pi64x (const __int64 i) { + union { + __int64 i; + __m64 v; + } u; + + u.i = i; + return u.v; +} +#endif + +static inline uint64_t +match_byte(uint64_t x, uint8_t n) { + return _mm_movemask_pi8(_mm_cmpeq_pi8(_mm_set1_pi8(n), _mm_set_pi64x(x))); +} + +static inline uint64_t +match_byte16(uint64_t hi, uint64_t lo, char c) { + __m128i control = _mm_set_epi64x(hi, lo); + return _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_set1_epi8(c), control)); +} + +static inline int +bitmask_getpos(uint64_t mask) { + return CountTrailingZeroesNonzero64(mask); +} + +#else /* HAVE_SSE2 */ + +// 8x8 simd-like functions. +// https://graphics.stanford.edu/~seander/bithacks.html +static const uint64_t lsb = 0x0101010101010101ull; +static const uint64_t msb = 0x8080808080808080ull; + +// 0x00 -> 0x80, otherwise -> 0x00. +// Caution: This function has false positive in limited case. +static inline uint64_t +haszero(uint64_t v) { + return (((v) - lsb) & ~(v) & msb); +} + +// Find n in x. n should be 7bit value. +static inline uint64_t +match_byte(uint64_t x, uint8_t n) { + return haszero(x ^ (n*lsb)); +} + +static inline int +bitmask_getpos(uint64_t mask) { + // 80 byte means found. LSB first. + // Example: 00 00 00 80 00 00 80 00 00 -> 2, 5 + int pos = CountTrailingZeroesNonzero64(mask); + + // Adjust position by >>3: + // (0+7) >> 3 = 0 + // (8+7) >> 3 = 1 + // (16+7) >> 3 = 2 + return pos >> 3; +} + +#endif + +static inline void +bitmask_next(uint64_t *found) { + // reset lowest 1bit + *found &= (*found - 1); +} + /*[clinic input] class dict "PyDictObject *" "&PyDict_Type" [clinic start generated code]*/ @@ -394,80 +523,6 @@ dictkeys_set_index(PyDictKeysObject *keys, Py_ssize_t i, Py_ssize_t ix) } } -// 8x8 simd-like functions. -// https://graphics.stanford.edu/~seander/bithacks.html - -static const uint64_t lsb = 0x0101010101010101ull; -static const uint64_t msb = 0x8080808080808080ull; - -// 0x00 -> 0x80, otherwise -> 0x00. -// Caution: This function has false positive in limited case. -static inline uint64_t -haszero(uint64_t v) { - return (((v) - lsb) & ~(v) & msb); -} - -// Find n in x. n should be 7bit value. -static inline uint64_t -hasvalue(uint64_t x, uint8_t n) { - return haszero(x ^ (n*lsb)); -} - -/* CountTrailingZeroesNonzero64 - -This function is copied from: -https://github.com/abseil/abseil-cpp/blob/1ae9b71c474628d60eb251a3f62967fe64151bb2/absl/numeric/internal/bits.h#L273 - -License of this function: -// Copyright 2020 The Abseil Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -Modifications: - -* Port to C, remove some macros. - -*/ -static inline int -CountTrailingZeroesNonzero64(uint64_t x) { -#if (defined(__clang__) || defined(__GNUC__)) - return __builtin_ctzll(x); -#elif defined(_MSC_VER) && !defined(__clang__) && \ - (defined(_M_X64) || defined(_M_ARM64)) - unsigned long result = 0; - _BitScanForward64(&result, x); - return result; -#elif defined(_MSC_VER) && !defined(__clang__) - unsigned long result = 0; - if ((uint32_t)(x) == 0) { - _BitScanForward(&result, (uint32_t)(x >> 32)); - return result + 32; - } - _BitScanForward(&result, (unsigned long)(x)); - return result; -#else - int c = 63; - x &= ~x + 1; - if (x & 0x00000000FFFFFFFF) c -= 32; - if (x & 0x0000FFFF0000FFFF) c -= 16; - if (x & 0x00FF00FF00FF00FF) c -= 8; - if (x & 0x0F0F0F0F0F0F0F0F) c -= 4; - if (x & 0x3333333333333333) c -= 2; - if (x & 0x5555555555555555) c -= 1; - return c; -#endif -} - /* USABLE_FRACTION is the maximum dictionary load. * Increasing this ratio makes dictionaries more dense resulting in more * collisions. Decreasing it improves sparseness at the expense of spreading @@ -953,43 +1008,32 @@ dictkeys_stringlookup_vector(PyDictKeysObject* dk, PyObject *key, Py_hash_t hash { assert(DK_ISVECTOR(dk)); PyDictKeyEntry *ep0 = DK_ENTRIES_MIN(dk); - const uint64_t indices = *(const uint64_t*)(dk->dk_indices); + const uint64_t *indices = (const uint64_t*)(dk->dk_indices); + uint8_t h1 = (uint8_t)(hash & 0x7f); + uint64_t found; #if HAVE_SSE2 // vector16 is supported only with SSE2 - uint64_t hi = dk->dk_log2_size == 3 ? -1ULL : ((const uint64_t*)(dk->dk_indices))[1]; - uint64_t found = _mm_movemask_epi8( - _mm_cmpeq_epi8(_mm_set1_epi8(hash & 0x7f), _mm_set_epi64x(hi, indices))); - //fprintf(stderr, "string hi=%llx lo=%llx hash=%x found=%lx\n", hi, indices, (int)hash&0x7f, found); + if (dk->dk_log2_size == 3) { + found = match_byte(indices[0], h1); + } + else { + found = match_byte16(indices[1], indices[0], h1); + } #else - uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); + found = match_byte(indices[0], h1); #endif - while (found) { + for (; found; bitmask_next(&found)) { // 80 byte means found. LSB first. // Example: 00 00 00 80 00 00 80 00 00 -> 2, 5 - int pos = CountTrailingZeroesNonzero64(found); -#if !HAVE_SSE2 - assert((pos+1) % 8 == 0); - // 7 >> 3 = 0 - // (8+7) >> 3 = 1 - // (16+7) >> 3 = 2 - // ... - pos >>= 3; -#endif + int pos = bitmask_getpos(found); PyDictKeyEntry *ep = &ep0[pos]; if (ep->me_key == key || - (ep->me_hash == hash -#if !HAVE_SSE2 /* hasvalue() has rare false positive. */ - && ep->me_key != NULL -#endif - && unicode_eq(ep->me_key, key))) { + (ep->me_hash == hash && unicode_eq(ep->me_key, key))) { return pos; } - - // reset lowest 1 bit. - found &= (found - 1); } return DKIX_EMPTY; } @@ -1106,37 +1150,30 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu if (DK_ISVECTOR(dk)) { ep0 = DK_ENTRIES_MIN(dk); // Use vector instead of hash table. - const uint64_t indices = *(const uint64_t*)(dk->dk_indices); + const uint64_t *indices = (const uint64_t*)(dk->dk_indices); + uint8_t h1 = (uint8_t)(hash & 0x7f); + uint64_t found; #if HAVE_SSE2 - uint64_t hi = dk->dk_log2_size == 3 ? -1ULL : ((const uint64_t*)(dk->dk_indices))[1]; - uint64_t found = _mm_movemask_epi8( - _mm_cmpeq_epi8(_mm_set1_epi8(hash & 0x7f), _mm_set_epi64x(hi, indices))); - //fprintf(stderr, "general log2=%d hi=%llx lo=%llx hash=%x found=%lx\n", dk->dk_log2_size, hi, indices, (int)hash&0x7f, found); + // vector16 is supported only with SSE2 + if (dk->dk_log2_size == 3) { + found = match_byte(indices[0], h1); + } + else { + found = match_byte16(indices[1], indices[0], h1); + } #else - uint64_t found = hasvalue(indices, (uint8_t)(hash & 0x7f)); + found = match_byte(indices[0], h1); #endif - while (found) { - ix = CountTrailingZeroesNonzero64(found); -#if !HAVE_SSE2 - assert((ix+1) % 8 == 0); - // 7 >> 3 = 0 - // (8+7) >> 3 = 1 - // (16+7) >> 3 = 2 - // ... - ix >>= 3; -#endif + for (; found; bitmask_next(&found)) { + ix = bitmask_getpos(found); PyDictKeyEntry *ep = &ep0[ix]; if (ep->me_key == key) { goto found; } - if (ep->me_hash == hash -#if !HAVE_SSE2 /* hasvalue() has rare false positive. */ - && ep->me_key != NULL -#endif - ) { + if (ep->me_hash == hash) { PyObject *startkey = ep->me_key; Py_INCREF(startkey); int cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); @@ -1157,9 +1194,6 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu goto start; } } - - // reset lowest 1 bit. - found &= (found - 1); } *value_addr = NULL; return DKIX_EMPTY;