diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp
index 3ca8124aa..bd8e8593f 100644
--- a/src/tools/stringTools.cpp
+++ b/src/tools/stringTools.cpp
@@ -208,6 +208,49 @@ bool isHarmlessUriChar(char c)
   return false;
 }
 
+// Tells whether the character 'c' must be percent-encoded when it occurs
+// in a URI component of kind 'target'. Characters accepted by
+// isHarmlessUriChar() are never encoded; every character without a case
+// of its own in the switch below is always encoded.
+bool mustBeUriEncodedFor(kiwix::URIComponentKind target, char c)
+{
+  if (isHarmlessUriChar(c))
+    return false;
+
+  switch (c) {
+    case '/': // There is no reason to encode the path separator in the general
+              // case. It must be encoded only in a path component when its
+              // meaning as a path separator has to be suppressed.
+      return false;
+
+    case '@': // In a relative URL of the form abc@def/xyz (with no / in abc)
+              // a non-encoded @ will cause "abc" and "def" to be interpreted
+              // as username and host components, respectively
+      return target == kiwix::URIComponentKind::PATH;
+
+    case ':': // In a relative URL of the form abc:def/xyz (with no / in abc)
+              // a non-encoded : will cause "abc" and "def" to be interpreted
+              // as host and port components, respectively
+      return target == kiwix::URIComponentKind::PATH;
+
+    case '?': // A non-encoded '?' acts as a separator between the path
+              // and query components
+      return target == kiwix::URIComponentKind::PATH;
+
+    // '&' and '=' delimit the key=value pairs of a query string, and
+    // a '+' there is conventionally decoded as a space
+    case '&': return target == kiwix::URIComponentKind::QUERY;
+    case '=': return target == kiwix::URIComponentKind::QUERY;
+    case '+': return target == kiwix::URIComponentKind::QUERY;
+
+    case '#': // A non-encoded '#' in either path or query-component
+              // would mark the beginning of the fragment component
+      return true;
+  }
+
+  return true;
+}
+
 int hexToInt(char c) {
   switch (c) {
     case '0': return 0;
@@ -247,6 +290,28 @@ std::string kiwix::urlEncode(const std::string& value)
   return os.str();
 }
 
+namespace kiwix
+{
+
+std::string uriEncode(URIComponentKind target, const std::string& value)
+{
+  std::ostringstream os;
+  os << std::hex << std::uppercase;
+  for (const char c : value) {
+    if ( mustBeUriEncodedFor(target, c) ) {
+      // Go through unsigned char so that bytes >= 0x80 are not
+      // sign-extended where char is a signed type
+      const unsigned int charVal = static_cast<unsigned char>(c);
+      os << '%' << std::setw(2) << std::setfill('0') << charVal;
+    } else {
+      os << c;
+    }
+  }
+  return os.str();
+}
+
+} // namespace kiwix
+
 std::string kiwix::urlDecode(const std::string& value, bool component)
 {
   std::ostringstream os;
diff --git a/src/tools/stringTools.h b/src/tools/stringTools.h
index 1f55a22bc..d621d5af9 100644
--- a/src/tools/stringTools.h
+++ b/src/tools/stringTools.h
@@ -60,6 +60,17 @@ private:
 std::string urlEncode(const std::string& value);
 std::string urlDecode(const std::string& value, bool component = false);
 
+// Only URI components that are of interest to libkiwix
+// are included in the below enumeration type
+enum class URIComponentKind
+{
+  PATH,
+  QUERY
+};
+
+// Encode 'value' for usage in a URI component specified by 'target'
+std::string uriEncode(URIComponentKind target, const std::string& value);
+
 std::string join(const std::vector<std::string>& list, const std::string& sep);
 
 std::string ucAll(const std::string& word);
diff --git a/test/stringTools.cpp b/test/stringTools.cpp
index 27cc712b8..5aca1acfd 100644
--- a/test/stringTools.cpp
+++ b/test/stringTools.cpp
@@ -163,4 +163,44 @@ TEST(stringTools, urlDecode)
   EXPECT_EQ(urlDecode(encodedUriDelimSymbols, false), encodedUriDelimSymbols);
 }
 
-};
+// Checks which characters uriEncode() percent-encodes for the PATH and
+// QUERY components and which ones it passes through unchanged
+TEST(stringTools, uriEncode)
+{
+  const auto PATH = URIComponentKind::PATH;
+  const auto QUERY = URIComponentKind::QUERY;
+
+  // Letters, digits and harmless symbols are never encoded
+  const char alphabet[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  EXPECT_EQ(uriEncode(PATH, alphabet), alphabet);
+  EXPECT_EQ(uriEncode(QUERY, alphabet), alphabet);
+
+  const char numerals[] = "0123456789";
+  EXPECT_EQ(uriEncode(PATH, numerals), numerals);
+  EXPECT_EQ(uriEncode(QUERY, numerals), numerals);
+
+  const char harmlessSymbols[] = ".-_~()*!/";
+  EXPECT_EQ(uriEncode(PATH, harmlessSymbols), harmlessSymbols);
+  EXPECT_EQ(uriEncode(QUERY, harmlessSymbols), harmlessSymbols);
+
+  // URI delimiters: the set of encoded ones depends on the component
+  const char delimiters[] = ":@?=+&#$;,";
+  EXPECT_EQ(uriEncode(PATH, delimiters), "%3A%40%3F=+&%23%24%3B%2C");
+  EXPECT_EQ(uriEncode(QUERY, delimiters), ":@?%3D%2B%26%23%24%3B%2C");
+
+  // Everything below is encoded identically for both components
+  const char miscSymbols[] = R"(`%^[]{}\|"<>)";
+  EXPECT_EQ(uriEncode(PATH, miscSymbols), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E");
+  EXPECT_EQ(uriEncode(PATH, miscSymbols), uriEncode(QUERY, miscSymbols));
+
+  const char blanks[] = " \n\t\r";
+  EXPECT_EQ(uriEncode(PATH, blanks), "%20%0A%09%0D");
+  EXPECT_EQ(uriEncode(PATH, blanks), uriEncode(QUERY, blanks));
+
+  // Every byte of a multi-byte (non-ASCII) character is encoded
+  const char nonAscii[] = "Σ♂♀ツ";
+  EXPECT_EQ(uriEncode(PATH, nonAscii), "%CE%A3%E2%99%82%E2%99%80%E3%83%84");
+  EXPECT_EQ(uriEncode(PATH, nonAscii), uriEncode(QUERY, nonAscii));
+}
+
+} // unnamed namespace