Skip to content

Commit 840ac79

Browse files
committed
Further improve performance by fixing specialized data accessors
After the previous speed-up, almost 35% of the perf flamegraph for decoding was caught up in std::string::push_back(). That seemed suspicious, because I thought that initial allocation with reserve() should make the remaining push_back() calls very fast. Turns out that's not the case. Along the way, I found out that the optimized implementation for containers with mutable data() access (e.g. std::vector<uint8_t>) did not actually work. What I believed I had gotten to work was in fact silently removed by SFINAE; as a result, all allocation went through the fallback path with reserve() and push_back(). The fix required some changes but in the end isn't all that bad. In addition to the handler for containers with mutable data(), I added a handler for containers with mutable operator[] such as std::string (as long as it's possible to assign a char to it). To make sure it now works as intended, a static_assert() now ensures that the correct code path is being used. The benchmark from gaspardpetit/base64 uses std::string for both encoding and decoding, which is not so great but I guess makes it easier to integrate all the ad-hoc libraries with std::string APIs. Consequently, cppcodec now moves to the upper league in both encoding and decoding benchmarks. On my system for the 256 buffer size benchmark, encoding was at 1.37 before and is now at 0.80, or 40% less time spent compared to the previous commit. Decoding got faster from an original 1.68 to about 0.85, i.e. 50% less time spent. This puts cppcodec performance slightly behind the GNOME base64 implementation for encoding and in the neighborhood of nehadamvr/arduino-base64, way ahead of ElegantDice and such. For decoding, cppcodec now beats GNOME and catches up to Wikibooks.org/C, slightly behind Apache. Polfosol is still way ahead of cppcodec for both encoding and decoding.
1 parent 34b4f92 commit 840ac79

File tree

1 file changed

+159
-31
lines changed

1 file changed

+159
-31
lines changed

cppcodec/data/access.hpp

Lines changed: 159 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
#define CPPCODEC_DETAIL_DATA_ACCESS
2626

2727
#include <stdint.h> // for size_t
28+
#include <string> // for static_assert() checking that string will be optimized
29+
#include <type_traits> // for std::enable_if and such
30+
#include <vector> // for static_assert() checking that vector will be optimized
31+
32+
#include "../detail/config.hpp" // for CPPCODEC_ALWAYS_INLINE
2833

2934
namespace cppcodec {
3035
namespace data {
@@ -37,36 +42,43 @@ namespace data {
3742
// For const (read-only) types: char_data(const T&)
3843
// For both const and result types: size(const T&)
3944

40-
template <typename T> inline size_t size(const T& t) { return t.size(); }
41-
template <typename T, size_t N> inline constexpr size_t size(const T (&t)[N]) noexcept {
45+
template <typename T>
46+
CPPCODEC_ALWAYS_INLINE size_t size(const T& t) { return t.size(); }
47+
48+
template <typename T, size_t N>
49+
CPPCODEC_ALWAYS_INLINE constexpr size_t size(const T (&t)[N]) noexcept {
4250
return N * sizeof(t[0]);
4351
}
4452

4553
// Overload-ranking tag for fallback functions such as the generic
// create_state(). A parameter of this type still accepts a specific_t
// argument, but only through a derived-to-base conversion, so such an
// overload loses against any viable overload that names specific_t directly.
class general_t {};
4654
// Overload-ranking tag for specialized functions. Because it derives from
// general_t, an argument of this type matches specific_t overloads exactly
// and falls back to general_t overloads only when no specific one is viable.
class specific_t : public general_t {};
4755

4856
class empty_result_state {
49-
template <typename Result> inline void size(const Result& result) { return size(result); }
57+
template <typename Result>
58+
CPPCODEC_ALWAYS_INLINE void size(const Result& result) { return size(result); }
5059
};
5160

5261
// SFINAE: Generic fallback in case no specific state function applies.
5362
template <typename Result>
54-
inline empty_result_state create_state(Result&, general_t) { return empty_result_state(); }
63+
CPPCODEC_ALWAYS_INLINE empty_result_state create_state(Result&, general_t)
64+
{
65+
return empty_result_state();
66+
}
5567

5668
//
5769
// Generic templates for containers: Use these init()/put()/finish()
5870
// implementations if no specialization was found.
5971
//
6072

6173
template <typename Result>
62-
inline void init(Result& result, empty_result_state&, size_t capacity)
74+
CPPCODEC_ALWAYS_INLINE void init(Result& result, empty_result_state&, size_t capacity)
6375
{
6476
result.resize(0);
6577
result.reserve(capacity);
6678
}
6779

6880
template <typename Result>
69-
inline void finish(Result&, empty_result_state&)
81+
CPPCODEC_ALWAYS_INLINE void finish(Result&, empty_result_state&)
7082
{
7183
// Default is to push_back(), which already increases the size.
7284
}
@@ -86,52 +98,68 @@ template <typename Result> inline void put_uint8(Result& result, uint8_t c) { re
8698

8799
template <bool> struct put_impl;
88100
template <> struct put_impl<true> { // put_uint8() available
89-
template<typename Result> static void put(Result& result, uint8_t c) { put_uint8(result, c); }
101+
template<typename Result>
102+
static CPPCODEC_ALWAYS_INLINE void put(Result& result, uint8_t c)
103+
{
104+
put_uint8(result, c);
105+
}
90106
};
91107
template <> struct put_impl<false> { // put_uint8() not available
92-
template<typename Result> static void put(Result& result, uint8_t c) {
108+
template<typename Result>
109+
static CPPCODEC_ALWAYS_INLINE void put(Result& result, uint8_t c)
110+
{
93111
result.push_back(static_cast<char>(c));
94112
}
95113
};
96114

97-
template <typename Result> inline void put(Result& result, empty_result_state&, uint8_t c)
115+
template <typename Result>
116+
CPPCODEC_ALWAYS_INLINE void put(Result& result, empty_result_state&, uint8_t c)
98117
{
99118
using namespace fallback;
100119
put_impl<sizeof(fallback::flag(), put_uint8(result, c), fallback::flag()) != 1>::put(result, c);
101120
}
102121

103122
//
104-
// Specialization for container types with direct mutable data access.
105-
// The expected way to specialize is to subclass empty_result_state and
123+
// Specialization for container types with direct mutable data access,
124+
// e.g. std::vector<uint8_t>.
125+
//
126+
// The expected way to specialize is to draft a new xyz_result_state type and
106127
// return an instance of it from a create_state() template specialization.
107128
// You can then create overloads for init(), put() and finish()
108-
// that are more specific than the empty_result_state ones above.
109-
// See the example below for direct access to a mutable data() method.
129+
// for the new result state type.
110130
//
111131
// If desired, a non-templated overload for both specific types
112132
// (result & state) can be added to tailor it to that particular result type.
113133
//
114134

115-
template <typename Result> class direct_data_access_result_state : empty_result_state
135+
// Compile-time probe: yields true when `candidate->data()[0] = 'x'` is a
// valid expression, i.e. when T exposes writable storage through data().
// For any other T the trailing return type is ill-formed and SFINAE drops
// this overload.
template <typename T>
constexpr auto data_is_mutable(T* candidate)
        -> decltype(candidate->data()[size_t(0)] = 'x', bool())
{
    return true;
}

// Catch-all that answers false whenever the probe above is not viable.
constexpr bool data_is_mutable(...)
{
    return false;
}
119141

120-
inline void init(Result& result, size_t capacity)
142+
template <typename Result>
143+
class direct_data_access_result_state
144+
{
145+
public:
146+
CPPCODEC_ALWAYS_INLINE void init(Result& result, size_t capacity)
121147
{
122-
// resize(0) is not called here since we don't rely on it
123-
result.reserve(capacity);
148+
// reserve() may not actually allocate the storage right away,
149+
// and it isn't guaranteed that it will be untouched upon the
150+
// next resize(). In that light, resize from the start and
151+
// slightly reduce the size at the end if necessary.
152+
result.resize(capacity);
124153
}
125-
inline void put(Result& result, char c)
154+
CPPCODEC_ALWAYS_INLINE void put(Result& result, char c)
126155
{
127-
// This only compiles if decltype(data) == char*
128-
result.data()[m_offset++] = static_cast<char>(c);
156+
result.data()[m_offset++] = c;
129157
}
130-
inline void finish(Result& result)
158+
CPPCODEC_ALWAYS_INLINE void finish(Result& result)
131159
{
132160
result.resize(m_offset);
133161
}
134-
inline size_t size(const Result&)
162+
CPPCODEC_ALWAYS_INLINE size_t size(const Result&)
135163
{
136164
return m_offset;
137165
}
@@ -142,23 +170,123 @@ template <typename Result> class direct_data_access_result_state : empty_result_
142170
// SFINAE: Select a specific state based on the result type and possible result state type.
143171
// Implement this if direct data access (`result.data()[0] = 'x'`) isn't already possible
144172
// and you want to specialize it for your own result type.
145-
template <typename Result, typename ResultState =
146-
typename direct_data_access_result_state<Result>::result_type::value>
147-
inline ResultState create_state(Result&, specific_t) { return ResultState(); }
173+
// Note: The enable_if should ideally be part of the class declaration,
174+
// but Visual Studio C++ will not compile it that way.
175+
// Have it here in the factory function instead.
176+
template <typename Result,
177+
typename = typename std::enable_if<
178+
data_is_mutable((Result*)nullptr)>::type>
179+
CPPCODEC_ALWAYS_INLINE direct_data_access_result_state<Result> create_state(Result&, specific_t)
180+
{
181+
return direct_data_access_result_state<Result>();
182+
}
148183

184+
static_assert(std::is_same<
185+
decltype(create_state(*(std::vector<uint8_t>*)nullptr, specific_t())),
186+
direct_data_access_result_state<std::vector<uint8_t>>>::value,
187+
"std::vector<uint8_t> must be handled by direct_data_access_result_state");
188+
189+
// Specialized init(), put() and finish() functions for direct_data_access_result_state.
149190
template <typename Result>
150-
inline void init(Result& result, direct_data_access_result_state<Result>& state, size_t capacity)
191+
CPPCODEC_ALWAYS_INLINE void init(Result& result, direct_data_access_result_state<Result>& state, size_t capacity)
151192
{
152-
state.init(result);
193+
state.init(result, capacity);
153194
}
154195

155-
// Specialized put function for direct_data_access_result_state.
156196
template <typename Result>
157-
inline void put(Result& result, direct_data_access_result_state<Result>& state, char c)
197+
CPPCODEC_ALWAYS_INLINE void put(Result& result, direct_data_access_result_state<Result>& state, char c)
158198
{
159199
state.put(result, c);
160200
}
161201

202+
template <typename Result>
203+
CPPCODEC_ALWAYS_INLINE void finish(Result& result, direct_data_access_result_state<Result>& state)
204+
{
205+
state.finish(result);
206+
}
207+
208+
//
209+
// Specialization for container types with direct mutable array access,
210+
// e.g. std::string. This is generally faster because bound checks are
211+
// minimal and operator[] is more likely noexcept. In addition,
212+
// std::string::push_back() needs to write a null character on every
213+
// expansion, which should be more efficient when done in bulk by resize().
214+
//
215+
// Compared to the above, tracking an extra offset variable is cheap.
216+
//
217+
218+
// Compile-time probe: yields true when `(*candidate)[0] = 'x'` is a valid
// expression, i.e. when T offers a writable operator[] that accepts a char.
// For any other T the trailing return type is ill-formed and SFINAE drops
// this overload.
template <typename T>
constexpr auto array_access_is_mutable(T* candidate)
        -> decltype((*candidate)[size_t(0)] = 'x', bool())
{
    return true;
}

// Catch-all that answers false whenever the probe above is not viable.
constexpr bool array_access_is_mutable(...)
{
    return false;
}
224+
225+
template <typename Result>
226+
class array_access_result_state
227+
{
228+
public:
229+
CPPCODEC_ALWAYS_INLINE void init(Result& result, size_t capacity)
230+
{
231+
// reserve() may not actually allocate the storage right away,
232+
// and it isn't guaranteed that it will be untouched upon the
233+
//.next resize(). In that light, resize from the start and
234+
// slightly reduce the size at the end if necessary.
235+
result.resize(capacity);
236+
}
237+
CPPCODEC_ALWAYS_INLINE void put(Result& result, char c)
238+
{
239+
result[m_offset++] = c;
240+
}
241+
CPPCODEC_ALWAYS_INLINE void finish(Result& result)
242+
{
243+
result.resize(m_offset);
244+
}
245+
CPPCODEC_ALWAYS_INLINE size_t size(const Result&)
246+
{
247+
return m_offset;
248+
}
249+
private:
250+
size_t m_offset = 0;
251+
};
252+
253+
// SFINAE: Select a specific state based on the result type and possible result state type.
254+
// Note: The enable_if should ideally be part of the class declaration,
255+
// but Visual Studio C++ will not compile it that way.
256+
// Have it here in the factory function instead.
257+
template <typename Result,
258+
typename = typename std::enable_if<
259+
!data_is_mutable((Result*)nullptr) // no more than one template option
260+
&& array_access_is_mutable((Result*)nullptr)>::type>
261+
CPPCODEC_ALWAYS_INLINE array_access_result_state<Result> create_state(Result&, specific_t)
262+
{
263+
return array_access_result_state<Result>();
264+
}
265+
266+
static_assert(std::is_same<
267+
decltype(create_state(*(std::string*)nullptr, specific_t())),
268+
array_access_result_state<std::string>>::value,
269+
"std::string must be handled by array_access_result_state");
270+
271+
// Specialized init(), put() and finish() functions for array_access_result_state.
272+
template <typename Result>
273+
CPPCODEC_ALWAYS_INLINE void init(Result& result, array_access_result_state<Result>& state, size_t capacity)
274+
{
275+
state.init(result, capacity);
276+
}
277+
278+
template <typename Result>
279+
CPPCODEC_ALWAYS_INLINE void put(Result& result, array_access_result_state<Result>& state, char c)
280+
{
281+
state.put(result, c);
282+
}
283+
284+
template <typename Result>
285+
CPPCODEC_ALWAYS_INLINE void finish(Result& result, array_access_result_state<Result>& state)
286+
{
287+
state.finish(result);
288+
}
289+
162290
// char_data() is only used to read, not for result buffers.
163291
template <typename T> inline const char* char_data(const T& t)
164292
{

0 commit comments

Comments
 (0)