From 4e9ce9d3c0edd7431a3eb0dc718091d914e7321d Mon Sep 17 00:00:00 2001 From: Zhuo Wang Date: Wed, 31 Dec 2025 14:35:54 +0800 Subject: [PATCH 1/3] feat: a simple impl of url encoder --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/meson.build | 1 + src/iceberg/test/CMakeLists.txt | 1 + src/iceberg/test/meson.build | 1 + src/iceberg/test/url_encoder_test.cc | 83 ++++++++++++++++++++++++++++ src/iceberg/util/meson.build | 1 + src/iceberg/util/url_encoder.cc | 75 +++++++++++++++++++++++++ src/iceberg/util/url_encoder.h | 54 ++++++++++++++++++ 8 files changed, 217 insertions(+) create mode 100644 src/iceberg/test/url_encoder_test.cc create mode 100644 src/iceberg/util/url_encoder.cc create mode 100644 src/iceberg/util/url_encoder.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 36c3a483d..9954afdb7 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -94,6 +94,7 @@ set(ICEBERG_SOURCES util/timepoint.cc util/truncate_util.cc util/type_util.cc + util/url_encoder.cc util/uuid.cc) set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS) diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 3929e1803..cc0991da6 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -116,6 +116,7 @@ iceberg_sources = files( 'util/timepoint.cc', 'util/truncate_util.cc', 'util/type_util.cc', + 'util/url_encoder.cc', 'util/uuid.cc', ) diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 30a473fd2..71bf90182 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -108,6 +108,7 @@ add_iceberg_test(util_test location_util_test.cc string_util_test.cc truncate_util_test.cc + url_encoder_test.cc uuid_test.cc visit_type_test.cc) diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index 378182819..50422ccc9 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -88,6 +88,7 @@ iceberg_tests = { 'location_util_test.cc', 
'string_util_test.cc', 'truncate_util_test.cc', + 'url_encoder_test.cc', 'uuid_test.cc', 'visit_type_test.cc', ), diff --git a/src/iceberg/test/url_encoder_test.cc b/src/iceberg/test/url_encoder_test.cc new file mode 100644 index 000000000..fe22a9bb0 --- /dev/null +++ b/src/iceberg/test/url_encoder_test.cc @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/url_encoder.h" + +#include + +#include "iceberg/test/matchers.h" + +namespace iceberg { + +TEST(UrlEncoderTest, Encode) { + // RFC 3986 unreserved characters should not be encoded + EXPECT_THAT(UrlEncoder::Encode("abc123XYZ"), ::testing::Eq("abc123XYZ")); + EXPECT_THAT(UrlEncoder::Encode("test-file_name.txt~backup"), + ::testing::Eq("test-file_name.txt~backup")); + + // Spaces and special characters should be encoded + EXPECT_THAT(UrlEncoder::Encode("hello world"), ::testing::Eq("hello%20world")); + EXPECT_THAT(UrlEncoder::Encode("test@example.com"), + ::testing::Eq("test%40example.com")); + EXPECT_THAT(UrlEncoder::Encode("path/to/file"), ::testing::Eq("path%2Fto%2Ffile")); + EXPECT_THAT(UrlEncoder::Encode("key=value&foo=bar"), + ::testing::Eq("key%3Dvalue%26foo%3Dbar")); + EXPECT_THAT(UrlEncoder::Encode("100%"), ::testing::Eq("100%25")); + EXPECT_THAT(UrlEncoder::Encode("hello\x1fworld"), ::testing::Eq("hello%1Fworld")); + EXPECT_THAT(UrlEncoder::Encode(""), ::testing::Eq("")); +} + +TEST(UrlEncoderTest, Decode) { + // Decode percent-encoded strings + EXPECT_THAT(UrlEncoder::Decode("hello%20world"), ::testing::Eq("hello world")); + EXPECT_THAT(UrlEncoder::Decode("test%40example.com"), + ::testing::Eq("test@example.com")); + EXPECT_THAT(UrlEncoder::Decode("path%2fto%2Ffile"), ::testing::Eq("path/to/file")); + EXPECT_THAT(UrlEncoder::Decode("key%3dvalue%26foo%3Dbar"), + ::testing::Eq("key=value&foo=bar")); + EXPECT_THAT(UrlEncoder::Decode("100%25"), ::testing::Eq("100%")); + + // ASCII Unit Separator (0x1F) + EXPECT_THAT(UrlEncoder::Decode("hello%1Fworld"), ::testing::Eq("hello\x1Fworld")); + + // Unreserved characters remain unchanged + EXPECT_THAT(UrlEncoder::Decode("test-file_name.txt~backup"), + ::testing::Eq("test-file_name.txt~backup")); + EXPECT_THAT(UrlEncoder::Decode(""), ::testing::Eq("")); +} + +TEST(UrlEncoderTest, EncodeDecodeRoundTrip) { + std::vector test_cases = {"hello world", + "test@example.com", + "path/to/file", + 
"key=value&foo=bar", + "100%", + "hello\x1Fworld", + "special!@#$%^&*()chars", + "mixed-123_test.file~ok", + ""}; + + for (const auto& test : test_cases) { + std::string encoded = UrlEncoder::Encode(test); + std::string decoded = UrlEncoder::Decode(encoded); + EXPECT_EQ(decoded, test) << "Round-trip failed for: " << test; + } +} + +} // namespace iceberg diff --git a/src/iceberg/util/meson.build b/src/iceberg/util/meson.build index 880f63401..b3866b705 100644 --- a/src/iceberg/util/meson.build +++ b/src/iceberg/util/meson.build @@ -38,6 +38,7 @@ install_headers( 'timepoint.h', 'truncate_util.h', 'type_util.h', + 'url_encoder.h', 'uuid.h', 'visitor_generate.h', 'visit_type.h', diff --git a/src/iceberg/util/url_encoder.cc b/src/iceberg/util/url_encoder.cc new file mode 100644 index 000000000..c8ed4a264 --- /dev/null +++ b/src/iceberg/util/url_encoder.cc @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/url_encoder.h" + +namespace iceberg { + +namespace { +// Helper: convert hex char to int (0–15), returns -1 if invalid +constexpr int8_t FromHex(char c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + return -1; +} +} // namespace + +std::string UrlEncoder::Encode(std::string_view str_to_encode) { + static const char* hex_chars = "0123456789ABCDEF"; + std::string result; + result.reserve(str_to_encode.size() * 3 /* Worst case: every char becomes %XX */); + + for (unsigned char c : str_to_encode) { + if (std::isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') { + result += static_cast(c); + } else { + result += '%'; + result += hex_chars[c >> 4]; + result += hex_chars[c & 0xF]; + } + } + + return result; +} + +std::string UrlEncoder::Decode(std::string_view str_to_decode) { + std::string result; + result.reserve(str_to_decode.size()); + + for (size_t i = 0; i < str_to_decode.size(); ++i) { + char c = str_to_decode[i]; + if (c == '%' && i + 2 < str_to_decode.size()) { + int8_t hi = FromHex(str_to_decode[i + 1]); + int8_t lo = FromHex(str_to_decode[i + 2]); + + if (hi != -1 && lo != -1) { + result += static_cast((hi << 4) | lo); + i += 2; + continue; + } + } + // Not a valid %XX sequence, copy as-is + result += c; + } + + return result; +} + +} // namespace iceberg diff --git a/src/iceberg/util/url_encoder.h b/src/iceberg/util/url_encoder.h new file mode 100644 index 000000000..50a14c504 --- /dev/null +++ b/src/iceberg/util/url_encoder.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +/// \file iceberg/util/url_encoder.h +/// \brief URL encoding and decoding. + +namespace iceberg { + +/// \brief Utilities for encoding and decoding URLs. +class ICEBERG_EXPORT UrlEncoder { + public: + /// \brief URL-encode a string. + /// + /// \details This is a simple implementation of url-encode + /// - Unreserved characters: [A-Z], [a-z], [0-9], "-", "_", ".", "~" + /// - Space is encoded as "%20" (unlike Java's URLEncoder which uses "+"). + /// - All other characters are percent-encoded (%XX). + /// \param str_to_encode The string to encode. + /// \return The URL-encoded string. + static std::string Encode(std::string_view str_to_encode); + + /// \brief URL-decode a string. + /// + /// \details Decodes percent-encoded characters (e.g., "%20" -> space). + /// \param str_to_decode The encoded string to decode. + /// \return The decoded string. 
+ static std::string Decode(std::string_view str_to_decode); +}; + +} // namespace iceberg From 383a74679abe68b8f8e297347393843e00904cf1 Mon Sep 17 00:00:00 2001 From: Zhuo Wang Date: Mon, 5 Jan 2026 16:41:51 +0800 Subject: [PATCH 2/3] add `IsUnreserved` func for encode check --- src/iceberg/util/url_encoder.cc | 17 +++++++++++++---- src/iceberg/util/url_encoder.h | 1 - 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/iceberg/util/url_encoder.cc b/src/iceberg/util/url_encoder.cc index c8ed4a264..b47c021b4 100644 --- a/src/iceberg/util/url_encoder.cc +++ b/src/iceberg/util/url_encoder.cc @@ -19,9 +19,17 @@ #include "iceberg/util/url_encoder.h" +#include + namespace iceberg { namespace { + +bool IsUnreserved(unsigned char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + c == '-' || c == '.' || c == '_' || c == '~'; +} + // Helper: convert hex char to int (0–15), returns -1 if invalid constexpr int8_t FromHex(char c) { if (c >= '0' && c <= '9') return c - '0'; @@ -29,16 +37,17 @@ constexpr int8_t FromHex(char c) { if (c >= 'a' && c <= 'f') return c - 'a' + 10; return -1; } + } // namespace std::string UrlEncoder::Encode(std::string_view str_to_encode) { static const char* hex_chars = "0123456789ABCDEF"; std::string result; - result.reserve(str_to_encode.size() * 3 /* Worst case: every char becomes %XX */); + result.reserve(str_to_encode.size() * 3 / 2 /* Heuristic reservation */); - for (unsigned char c : str_to_encode) { - if (std::isalnum(c) || c == '-' || c == '_' || c == '.' 
|| c == '~') { - result += static_cast(c); + for (char c : str_to_encode) { + if (IsUnreserved(c)) { + result += c; } else { result += '%'; result += hex_chars[c >> 4]; diff --git a/src/iceberg/util/url_encoder.h b/src/iceberg/util/url_encoder.h index 50a14c504..c6b8dd707 100644 --- a/src/iceberg/util/url_encoder.h +++ b/src/iceberg/util/url_encoder.h @@ -23,7 +23,6 @@ #include #include "iceberg/iceberg_export.h" -#include "iceberg/result.h" /// \file iceberg/util/url_encoder.h /// \brief URL encoding and decoding. From fcb91a26bea727ca3bed51e8ad8f7c537314af48 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 5 Jan 2026 20:28:07 +0800 Subject: [PATCH 3/3] Update src/iceberg/util/url_encoder.cc --- src/iceberg/util/url_encoder.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/iceberg/util/url_encoder.cc b/src/iceberg/util/url_encoder.cc index b47c021b4..a6ae4fd86 100644 --- a/src/iceberg/util/url_encoder.cc +++ b/src/iceberg/util/url_encoder.cc @@ -41,7 +41,7 @@ constexpr int8_t FromHex(char c) { } // namespace std::string UrlEncoder::Encode(std::string_view str_to_encode) { - static const char* hex_chars = "0123456789ABCDEF"; + static const char* kHexChars = "0123456789ABCDEF"; std::string result; result.reserve(str_to_encode.size() * 3 / 2 /* Heuristic reservation */); @@ -50,8 +50,9 @@ std::string UrlEncoder::Encode(std::string_view str_to_encode) { result += c; } else { result += '%'; - result += hex_chars[c >> 4]; - result += hex_chars[c & 0xF]; + // char may be signed; cast so bytes >= 0x80 do not yield a negative index. + result += kHexChars[static_cast<unsigned char>(c) >> 4]; + result += kHexChars[c & 0xF]; } }