diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 36c3a483d..9954afdb7 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -94,6 +94,7 @@ set(ICEBERG_SOURCES
     util/timepoint.cc
     util/truncate_util.cc
     util/type_util.cc
+    util/url_encoder.cc
     util/uuid.cc)
 
 set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 3929e1803..cc0991da6 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -116,6 +116,7 @@ iceberg_sources = files(
     'util/timepoint.cc',
     'util/truncate_util.cc',
     'util/type_util.cc',
+    'util/url_encoder.cc',
     'util/uuid.cc',
 )
 
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 30a473fd2..71bf90182 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -108,6 +108,7 @@ add_iceberg_test(util_test
                  location_util_test.cc
                  string_util_test.cc
                  truncate_util_test.cc
+                 url_encoder_test.cc
                  uuid_test.cc
                  visit_type_test.cc)
 
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index 378182819..50422ccc9 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -88,6 +88,7 @@ iceberg_tests = {
             'location_util_test.cc',
             'string_util_test.cc',
             'truncate_util_test.cc',
+            'url_encoder_test.cc',
             'uuid_test.cc',
             'visit_type_test.cc',
         ),
diff --git a/src/iceberg/test/url_encoder_test.cc b/src/iceberg/test/url_encoder_test.cc
new file mode 100644
index 000000000..fe22a9bb0
--- /dev/null
+++ b/src/iceberg/test/url_encoder_test.cc
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/url_encoder.h"
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "iceberg/test/matchers.h"
+
+namespace iceberg {
+
+TEST(UrlEncoderTest, Encode) {
+  // RFC 3986 unreserved characters should not be encoded
+  EXPECT_THAT(UrlEncoder::Encode("abc123XYZ"), ::testing::Eq("abc123XYZ"));
+  EXPECT_THAT(UrlEncoder::Encode("test-file_name.txt~backup"),
+              ::testing::Eq("test-file_name.txt~backup"));
+
+  // Spaces and special characters should be encoded
+  EXPECT_THAT(UrlEncoder::Encode("hello world"), ::testing::Eq("hello%20world"));
+  EXPECT_THAT(UrlEncoder::Encode("test@example.com"),
+              ::testing::Eq("test%40example.com"));
+  EXPECT_THAT(UrlEncoder::Encode("path/to/file"), ::testing::Eq("path%2Fto%2Ffile"));
+  EXPECT_THAT(UrlEncoder::Encode("key=value&foo=bar"),
+              ::testing::Eq("key%3Dvalue%26foo%3Dbar"));
+  EXPECT_THAT(UrlEncoder::Encode("100%"), ::testing::Eq("100%25"));
+  EXPECT_THAT(UrlEncoder::Encode("hello\x1fworld"), ::testing::Eq("hello%1Fworld"));
+  EXPECT_THAT(UrlEncoder::Encode(""), ::testing::Eq(""));
+}
+
+TEST(UrlEncoderTest, Decode) {
+  // Decode percent-encoded strings
+  EXPECT_THAT(UrlEncoder::Decode("hello%20world"), ::testing::Eq("hello world"));
+  EXPECT_THAT(UrlEncoder::Decode("test%40example.com"),
+              ::testing::Eq("test@example.com"));
+  EXPECT_THAT(UrlEncoder::Decode("path%2fto%2Ffile"), ::testing::Eq("path/to/file"));
+  EXPECT_THAT(UrlEncoder::Decode("key%3dvalue%26foo%3Dbar"),
+              ::testing::Eq("key=value&foo=bar"));
+  EXPECT_THAT(UrlEncoder::Decode("100%25"), ::testing::Eq("100%"));
+
+  // ASCII Unit Separator (0x1F)
+  EXPECT_THAT(UrlEncoder::Decode("hello%1Fworld"), ::testing::Eq("hello\x1Fworld"));
+
+  // Unreserved characters remain unchanged
+  EXPECT_THAT(UrlEncoder::Decode("test-file_name.txt~backup"),
+              ::testing::Eq("test-file_name.txt~backup"));
+  EXPECT_THAT(UrlEncoder::Decode(""), ::testing::Eq(""));
+}
+
+TEST(UrlEncoderTest, DecodeMalformedSequences) {
+  // Incomplete or non-hex escapes are not errors; they are copied through verbatim
+  EXPECT_THAT(UrlEncoder::Decode("100%"), ::testing::Eq("100%"));
+  EXPECT_THAT(UrlEncoder::Decode("abc%2"), ::testing::Eq("abc%2"));
+  EXPECT_THAT(UrlEncoder::Decode("%zz%20"), ::testing::Eq("%zz "));
+  EXPECT_THAT(UrlEncoder::Decode("%%41"), ::testing::Eq("%A"));
+}
+
+TEST(UrlEncoderTest, EncodeDecodeRoundTrip) {
+  const std::vector<std::string> test_cases = {"hello world",
+                                               "test@example.com",
+                                               "path/to/file",
+                                               "key=value&foo=bar",
+                                               "100%",
+                                               "hello\x1Fworld",
+                                               "special!@#$%^&*()chars",
+                                               "mixed-123_test.file~ok",
+                                               ""};
+
+  for (const auto& test : test_cases) {
+    std::string encoded = UrlEncoder::Encode(test);
+    std::string decoded = UrlEncoder::Decode(encoded);
+    EXPECT_EQ(decoded, test) << "Round-trip failed for: " << test;
+  }
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/util/meson.build b/src/iceberg/util/meson.build
index 880f63401..b3866b705 100644
--- a/src/iceberg/util/meson.build
+++ b/src/iceberg/util/meson.build
@@ -38,6 +38,7 @@ install_headers(
         'timepoint.h',
         'truncate_util.h',
         'type_util.h',
+        'url_encoder.h',
         'uuid.h',
         'visitor_generate.h',
         'visit_type.h',
diff --git a/src/iceberg/util/url_encoder.cc b/src/iceberg/util/url_encoder.cc
new file mode 100644
index 000000000..a6ae4fd86
--- /dev/null
+++ b/src/iceberg/util/url_encoder.cc
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/url_encoder.h"
+
+#include <cstdint>
+
+namespace iceberg {
+
+namespace {
+
+/// \brief Whether `c` is an RFC 3986 "unreserved" character, which is never escaped.
+constexpr bool IsUnreserved(unsigned char c) {
+  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+         c == '-' || c == '.' || c == '_' || c == '~';
+}
+
+/// \brief Value (0-15) of the hexadecimal digit `c`, or -1 if `c` is not a hex digit.
+constexpr int8_t FromHex(char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  return -1;
+}
+
+}  // namespace
+
+std::string UrlEncoder::Encode(std::string_view str_to_encode) {
+  static constexpr char kHexChars[] = "0123456789ABCDEF";
+  std::string result;
+  result.reserve(str_to_encode.size() * 3 / 2 /* Heuristic reservation */);
+
+  // Read bytes as unsigned: a signed char >= 0x80 would make `c >> 4` a negative index.
+  for (const unsigned char c : str_to_encode) {
+    if (IsUnreserved(c)) {
+      result += static_cast<char>(c);
+    } else {
+      result += '%';
+      result += kHexChars[c >> 4];
+      result += kHexChars[c & 0xF];
+    }
+  }
+  return result;
+}
+
+std::string UrlEncoder::Decode(std::string_view str_to_decode) {
+  std::string result;
+  result.reserve(str_to_decode.size());
+
+  for (size_t i = 0; i < str_to_decode.size(); ++i) {
+    const char c = str_to_decode[i];
+    // A '%' can only start an escape when two more characters follow it.
+    if (c == '%' && i + 2 < str_to_decode.size()) {
+      const int8_t hi = FromHex(str_to_decode[i + 1]);
+      const int8_t lo = FromHex(str_to_decode[i + 2]);
+      if (hi != -1 && lo != -1) {
+        result += static_cast<char>((hi << 4) | lo);
+        i += 2;  // Skip the two hex digits that were just consumed.
+        continue;
+      }
+    }
+    // Not a valid %XX sequence, copy as-is
+    result += c;
+  }
+  return result;
+}
+
+}  // namespace iceberg
diff --git
a/src/iceberg/util/url_encoder.h b/src/iceberg/util/url_encoder.h
new file mode 100644
index 000000000..c6b8dd707
--- /dev/null
+++ b/src/iceberg/util/url_encoder.h
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "iceberg/iceberg_export.h"
+
+/// \file iceberg/util/url_encoder.h
+/// \brief URL encoding and decoding.
+
+namespace iceberg {
+
+/// \brief Utilities for percent-encoding and percent-decoding URL components.
+class ICEBERG_EXPORT UrlEncoder {
+ public:
+  /// \brief URL-encode a string.
+  ///
+  /// \details This is a simple implementation of url-encode
+  /// - Unreserved characters: [A-Z], [a-z], [0-9], "-", "_", ".", "~" are kept as-is.
+  /// - Space is encoded as "%20" (unlike Java's URLEncoder which uses "+").
+  /// - All other characters are percent-encoded (%XX, upper-case hex digits).
+  ///
+  /// Delimiters such as "/", "=" and "&" are escaped as well, so encode individual
+  /// components rather than a complete URL.
+  /// \param str_to_encode The string to encode.
+  /// \return The URL-encoded string.
+  static std::string Encode(std::string_view str_to_encode);
+
+  /// \brief URL-decode a string.
+  ///
+  /// \details Decodes percent-encoded characters (e.g., "%20" -> space); the hex
+  /// digits may be upper or lower case.
+  /// - "+" is not treated as a space.
+  /// - A "%" that is not followed by two hex digits is copied to the output unchanged.
+  /// \param str_to_decode The encoded string to decode.
+  /// \return The decoded string.
+  static std::string Decode(std::string_view str_to_decode);
+};
+
+}  // namespace iceberg