From 07c7d3931d2d28381387250ab60a449d646f5883 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 17 Jan 2023 19:01:50 +0400 Subject: [PATCH 01/16] Added a unit-test of buggy urlEncode() Added a unit-test for urlEncode() that passes for its current implementation despite the two bugs that were revealed while creating the unit-test. --- test/stringTools.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 7402f6e0d..b6965f8fb 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -105,4 +105,48 @@ TEST(stringTools, extractFromString) ASSERT_THROW(extractFromString("3.14.5"), std::invalid_argument); } +namespace URLEncoding +{ + +const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; +const char digits[] = "0123456789"; +const char nonEncodableSymbols[] = ".-_~()*!"; +const char uriDelimSymbols[] = ":/@?=+&$;,"; + +// XXX: # should belong to uriDelimSymbols! +const char otherSymbols[] = R"(`#%^[]{}\|"<>)"; + +const char whitespace[] = " \n\t\r"; + +const char someNonASCIIChars[] = "Σ♂♀ツ"; + +} + +TEST(stringTools, urlEncode) +{ + using namespace URLEncoding; + + EXPECT_EQ(urlEncode(letters), letters); + EXPECT_EQ(urlEncode(letters, true), letters); + + EXPECT_EQ(urlEncode(digits), digits); + EXPECT_EQ(urlEncode(digits, true), digits); + + EXPECT_EQ(urlEncode(nonEncodableSymbols), nonEncodableSymbols); + EXPECT_EQ(urlEncode(nonEncodableSymbols, true), nonEncodableSymbols); + + EXPECT_EQ(urlEncode(uriDelimSymbols), uriDelimSymbols); + EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%2F%40%3F%3D%2B%26%24%3B%2C"); + + EXPECT_EQ(urlEncode(otherSymbols), "%60%23%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); + EXPECT_EQ(urlEncode(otherSymbols), urlEncode(otherSymbols, true)); + + // XXX: there is a bug with formatting of single-digit hex values + EXPECT_EQ(urlEncode(whitespace), "%20% A% 9% D"); + EXPECT_EQ(urlEncode(whitespace), urlEncode(whitespace, true)); + + EXPECT_EQ(urlEncode(someNonASCIIChars), "%CE%A3%E2%99%82%E2%99%80%E3%83%84"); + EXPECT_EQ(urlEncode(someNonASCIIChars), urlEncode(someNonASCIIChars, true)); +} + }; From e49081da809846097d87363b5a86440812521ee9 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 17 Jan 2023 19:11:37 +0400 Subject: [PATCH 02/16] Fixed urlEncode() for chars below 0x10 --- src/tools/stringTools.cpp | 3 ++- test/stringTools.cpp | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index dcd5fc4ff..8e81d1cf1 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -241,7 +241,8 @@ std::string kiwix::urlEncode(const std::string& value, bool encodeReserved) if (!needsEscape(*it, encodeReserved)) { os << *it; } else { - os << '%' << std::setw(2) << static_cast(static_cast(*it)); + const unsigned int charVal = static_cast(*it); + os << '%' << std::setw(2) << std::setfill('0') << charVal; } } return os.str(); diff --git a/test/stringTools.cpp b/test/stringTools.cpp index b6965f8fb..252264412 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -141,8 +141,7 @@ TEST(stringTools, urlEncode) EXPECT_EQ(urlEncode(otherSymbols), "%60%23%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); EXPECT_EQ(urlEncode(otherSymbols), urlEncode(otherSymbols, true)); - // XXX: there is a bug with formatting of single-digit hex values - EXPECT_EQ(urlEncode(whitespace), "%20% A% 9% D"); + EXPECT_EQ(urlEncode(whitespace), "%20%0A%09%0D"); EXPECT_EQ(urlEncode(whitespace), urlEncode(whitespace, true)); EXPECT_EQ(urlEncode(someNonASCIIChars), "%CE%A3%E2%99%82%E2%99%80%E3%83%84"); From 82d477009dff0a52b3b24ff8f5666044d1249350 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 17 Jan 2023 19:17:26 +0400 Subject: [PATCH 03/16] '#' is a URI delimiter symbol --- src/tools/stringTools.cpp | 1 + test/stringTools.cpp | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index 8e81d1cf1..021db592b 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -177,6 +177,7 @@ bool isReservedUrlChar(char c) case '=': case '+': case '$': + case '#': return true; default: return false; diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 252264412..462517668 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -111,10 +111,9 @@ namespace URLEncoding const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; const char digits[] = "0123456789"; const char nonEncodableSymbols[] = ".-_~()*!"; -const char uriDelimSymbols[] = ":/@?=+&$;,"; +const char uriDelimSymbols[] = ":/@?=+&#$;,"; -// XXX: # should belong to uriDelimSymbols! -const char otherSymbols[] = R"(`#%^[]{}\|"<>)"; +const char otherSymbols[] = R"(`%^[]{}\|"<>)"; const char whitespace[] = " \n\t\r"; @@ -136,9 +135,9 @@ TEST(stringTools, urlEncode) EXPECT_EQ(urlEncode(nonEncodableSymbols, true), nonEncodableSymbols); EXPECT_EQ(urlEncode(uriDelimSymbols), uriDelimSymbols); - EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%2F%40%3F%3D%2B%26%24%3B%2C"); + EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%2F%40%3F%3D%2B%26%23%24%3B%2C"); - EXPECT_EQ(urlEncode(otherSymbols), "%60%23%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); + EXPECT_EQ(urlEncode(otherSymbols), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); EXPECT_EQ(urlEncode(otherSymbols), urlEncode(otherSymbols, true)); EXPECT_EQ(urlEncode(whitespace), "%20%0A%09%0D"); From aa2e443eb89b44077f6593236e4a6ee024776938 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 17 Jan 2023 19:24:26 +0400 Subject: [PATCH 04/16] Fixed indentation Replaced tabs with spaces. --- src/tools/stringTools.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index 021db592b..a953a2005 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -269,15 +269,15 @@ std::string kiwix::urlDecode(const std::string& value, bool component) int iHi = hexToInt(hi); int iLo = hexToInt(lo); if (iHi < 0 || iLo < 0) { - // Invalid escape sequence - os << '%' << hi << lo; - continue; + // Invalid escape sequence + os << '%' << hi << lo; + continue; } char c = (char)(iHi << 4 | iLo); if (!component && isReservedUrlChar(c)) { - os << '%' << hi << lo; + os << '%' << hi << lo; } else { - os << c; + os << c; } } else { os << *it; From 822fb3748af8b27837c4e2758ba820f7c5dfb038 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 18 Jan 2023 17:24:46 +0400 Subject: [PATCH 05/16] Added a unit-test for urlDecode() --- test/stringTools.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 462517668..890688756 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -147,4 +147,26 @@ TEST(stringTools, urlEncode) EXPECT_EQ(urlEncode(someNonASCIIChars), urlEncode(someNonASCIIChars, true)); } +TEST(stringTools, urlDecode) +{ + using namespace URLEncoding; + + const std::string allTestChars = std::string(letters) + + digits + + nonEncodableSymbols + + uriDelimSymbols + + otherSymbols + + whitespace + + someNonASCIIChars; + + for ( const char c : allTestChars ) { + const std::string str(1, c); + EXPECT_EQ(urlDecode(urlEncode(str)), str); + EXPECT_EQ(urlDecode(urlEncode(str, true), true), str); + } + + EXPECT_EQ(urlDecode(urlEncode(allTestChars)), allTestChars); + EXPECT_EQ(urlDecode(urlEncode(allTestChars, true), true), allTestChars); +} + }; From c5ccbd37e25366a1bf8848b00472bb2ca3422873 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 17 Jan 2023 19:41:33 +0400 Subject: [PATCH 06/16] Extracted isHarmlessUriChar() --- src/tools/stringTools.cpp | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index a953a2005..c0e42cb05 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -161,6 +161,9 @@ std::string kiwix::encodeDiples(const std::string& str) return result; } +namespace +{ + /* urlEncode() based on javascript encodeURI() & encodeURIComponent(). Mostly code from rstudio/httpuv (GPLv3) */ @@ -184,16 +187,15 @@ bool isReservedUrlChar(char c) } } -bool needsEscape(char c, bool encodeReserved) +bool isHarmlessUriChar(char c) { if (c >= 'a' && c <= 'z') - return false; + return true; if (c >= 'A' && c <= 'Z') - return false; + return true; if (c >= '0' && c <= '9') - return false; - if (isReservedUrlChar(c)) - return encodeReserved; + return true; + switch (c) { case '-': case '_': @@ -204,8 +206,19 @@ bool needsEscape(char c, bool encodeReserved) case '\'': case '(': case ')': - return false; + return true; } + return false; +} + +bool needsEscape(char c, bool encodeReserved) +{ + if (isHarmlessUriChar(c)) + return false; + + if (isReservedUrlChar(c)) + return encodeReserved; + return true; } @@ -231,6 +244,8 @@ int hexToInt(char c) { } } +} // unnamed namespace + std::string kiwix::urlEncode(const std::string& value, bool encodeReserved) { std::ostringstream os; From 239b108fa77c8dca9c44fc63aeb905a41a7e924b Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 18 Jan 2023 16:42:59 +0400 Subject: [PATCH 07/16] / is no longer a reserved char for urlEncode() This change is a quick hack solving known issues with URI-encoding in libkiwix. This change removes the slash character from the list of URL separator symbols in URL encoding/decoding utilities, and makes it a symbol that is safe to leave unencoded. Effects: - `urlEncode()` never encodes the '/' symbol (even when it is requested to encode the URL separator symbols too). - `urlDecode(str)`/`urlDecode(..., false)` will now decode %2F to '/'; other encoded URL separator symbols are NOT decoded when the second argument of `urlDecode()` is set to false (which is the default). --- src/tools/stringTools.cpp | 2 +- test/server.cpp | 4 ++-- test/stringTools.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index c0e42cb05..3477aaaee 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -172,7 +172,6 @@ bool isReservedUrlChar(char c) switch (c) { case ';': case ',': - case '/': case '?': case ':': case '@': @@ -206,6 +205,7 @@ bool isHarmlessUriChar(char c) case '\'': case '(': case ')': + case '/': return true; } return false; diff --git a/test/server.cpp b/test/server.cpp index 78f56ecdc..29f1885b1 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -1156,7 +1156,7 @@ TEST_F(ServerTest, RandomPageRedirectsToAnExistingArticle) auto g = zfs1_->GET("/ROOT/random?content=zimfile"); ASSERT_EQ(302, g->status); ASSERT_TRUE(g->has_header("Location")); - ASSERT_TRUE(kiwix::startsWith(g->get_header_value("Location"), "/ROOT/content/zimfile/A%2F")); + ASSERT_TRUE(kiwix::startsWith(g->get_header_value("Location"), "/ROOT/content/zimfile/A/")); ASSERT_EQ(getCacheControlHeader(*g), "max-age=0, must-revalidate"); ASSERT_FALSE(g->has_header("ETag")); } @@ -1224,7 +1224,7 @@ TEST_F(ServerTest, BookMainPageIsRedirectedToArticleIndex) auto g = zfs1_->GET("/ROOT/content/zimfile"); ASSERT_EQ(302, g->status); ASSERT_TRUE(g->has_header("Location")); - ASSERT_EQ("/ROOT/content/zimfile/A%2Findex", g->get_header_value("Location")); + ASSERT_EQ("/ROOT/content/zimfile/A/index", g->get_header_value("Location")); } } diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 890688756..102e9bf0d 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -110,8 +110,8 @@ namespace URLEncoding const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; const char digits[] = "0123456789"; -const char nonEncodableSymbols[] = ".-_~()*!"; -const char uriDelimSymbols[] = ":/@?=+&#$;,"; +const char nonEncodableSymbols[] = ".-_~()*!/"; +const char uriDelimSymbols[] = ":@?=+&#$;,"; const char otherSymbols[] = R"(`%^[]{}\|"<>)"; @@ -135,7 +135,7 @@ TEST(stringTools, urlEncode) EXPECT_EQ(urlEncode(nonEncodableSymbols, true), nonEncodableSymbols); EXPECT_EQ(urlEncode(uriDelimSymbols), uriDelimSymbols); - EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%2F%40%3F%3D%2B%26%23%24%3B%2C"); + EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%40%3F%3D%2B%26%23%24%3B%2C"); EXPECT_EQ(urlEncode(otherSymbols), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); EXPECT_EQ(urlEncode(otherSymbols), urlEncode(otherSymbols, true)); From 0bde4d94125b622856fec9204a406a0d21ad7448 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 16:02:27 +0400 Subject: [PATCH 08/16] Properly URI-encoded links in search results Special URI symbols occurring in the item path part of the search result link were NOT encoded, because that would also encode the path separator (/) symbol. Now that `urlEncode()` never encodes the / symbol, it is safe to encode all other URI-special symbols in the path. --- src/search_renderer.cpp | 5 +++-- test/server_search.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/search_renderer.cpp b/src/search_renderer.cpp index 7b293f628..381354bc6 100644 --- a/src/search_renderer.cpp +++ b/src/search_renderer.cpp @@ -171,9 +171,10 @@ std::string SearchRenderer::renderTemplate(const std::string& tmpl_str) kainjow::mustache::data items{kainjow::mustache::data::type::list}; for (auto it = m_srs.begin(); it != m_srs.end(); it++) { kainjow::mustache::data result; - std::string zim_id(it.getZimId()); + const std::string zim_id(it.getZimId()); + const auto path = mp_nameMapper->getNameForId(zim_id) + "/" + it.getPath(); result.set("title", it.getTitle()); - result.set("absolutePath", absPathPrefix + urlEncode(mp_nameMapper->getNameForId(zim_id), true) + "/" + urlEncode(it.getPath())); + result.set("absolutePath", absPathPrefix + urlEncode(path, true)); result.set("snippet", it.getSnippet()); if (mp_library) { result.set("bookTitle", mp_library->getBookById(zim_id).getTitle()); diff --git a/test/server_search.cpp b/test/server_search.cpp index 3932602ca..e8af5f348 100644 --- a/test/server_search.cpp +++ b/test/server_search.cpp @@ -196,7 +196,7 @@ struct SearchResult const std::vector LARGE_SEARCH_RESULTS = { SEARCH_RESULT( - /*link*/ "/ROOT/content/zimfile/A/Genius_+_Soul_=_Jazz", + /*link*/ "/ROOT/content/zimfile/A/Genius_%2B_Soul_%3D_Jazz", /*title*/ "Genius + Soul = Jazz", /*snippet*/ R"SNIPPET(...Grammy Hall of Fame in 2011. It was re-issued in the UK, first in 1989 on the Castle Communications "Essential Records" label, and by Rhino Records in 1997 on a single CD together with Charles' 1970 My Kind of Jazz. In 2010, Concord Records released a deluxe edition comprising digitally remastered versions of Genius + Soul = Jazz, My Kind of Jazz, Jazz Number II, and My Kind of Jazz Part 3. Professional ratings Review scores Source Rating Allmusic link Warr.org link Encyclopedia of Popular Music...)SNIPPET", /*bookTitle*/ "Ray Charles", @@ -236,7 +236,7 @@ const std::vector LARGE_SEARCH_RESULTS = { ), SEARCH_RESULT( - /*link*/ "/ROOT/content/zimfile/A/Catchin'_Some_Rays:_The_Music_of_Ray_Charles", + /*link*/ "/ROOT/content/zimfile/A/Catchin'_Some_Rays%3A_The_Music_of_Ray_Charles", /*title*/ "Catchin' Some Rays: The Music of Ray Charles", /*snippet*/ R"SNIPPET(...jazz singer Roseanna Vitro, released in August 1997 on the Telarc Jazz label. Catchin' Some Rays: The Music of Ray Charles Studio album by Roseanna Vitro Released August 1997 Recorded March 26, 1997 at Sound on Sound, NYC April 4,1997 at Quad Recording Studios, NYC Genre Vocal jazz Length 61:00 Label Telarc Jazz CD-83419 Producer Paul Wickliffe Roseanna Vitro chronology Passion Dance (1996) Catchin' Some Rays: The Music of Ray Charles (1997) The Time of My Life: Roseanna Vitro Sings the Songs of......)SNIPPET", /*bookTitle*/ "Ray Charles", @@ -244,7 +244,7 @@ const std::vector LARGE_SEARCH_RESULTS = { ), SEARCH_RESULT( - /*link*/ "/ROOT/content/zimfile/A/That's_What_I_Say:_John_Scofield_Plays_the_Music_of_Ray_Charles", + /*link*/ "/ROOT/content/zimfile/A/That's_What_I_Say%3A_John_Scofield_Plays_the_Music_of_Ray_Charles", /*title*/ "That's What I Say: John Scofield Plays the Music of Ray Charles", /*snippet*/ R"SNIPPET(That's What I Say: John Scofield Plays the Music of Ray Charles Studio album by John Scofield Released June 7, 2005 (2005-06-07) Recorded December 2004 Studio Avatar Studios, New York City Genre Jazz Length 65:21 Label Verve Producer Steve Jordan John Scofield chronology EnRoute: John Scofield Trio LIVE (2004) That's What I Say: John Scofield Plays the Music of Ray Charles (2005) Out Louder (2006) Professional ratings Review scores Source Rating Allmusic All About Jazz All About Jazz...)SNIPPET", /*bookTitle*/ "Ray Charles", @@ -284,7 +284,7 @@ const std::vector LARGE_SEARCH_RESULTS = { ), SEARCH_RESULT( - /*link*/ "/ROOT/content/zimfile/A/Here_We_Go_Again:_Celebrating_the_Genius_of_Ray_Charles", + /*link*/ "/ROOT/content/zimfile/A/Here_We_Go_Again%3A_Celebrating_the_Genius_of_Ray_Charles", /*title*/ "Here We Go Again: Celebrating the Genius of Ray Charles", /*snippet*/ R"SNIPPET(...and jazz trumpeter Wynton Marsalis. It was recorded during concerts at the Rose Theater in New York City, on February 9 and 10, 2009. The album received mixed reviews, in which the instrumentation of Marsalis' orchestra was praised by the critics. Here We Go Again: Celebrating the Genius of Ray Charles Live album by Willie Nelson and Wynton Marsalis Released March 29, 2011 (2011-03-29) Recorded February 9 –10 2009 Venue Rose Theater, New York Genre Jazz, country Length 61:49 Label Blue Note......)SNIPPET", /*bookTitle*/ "Ray Charles", @@ -356,7 +356,7 @@ const std::vector LARGE_SEARCH_RESULTS = { ), SEARCH_RESULT( - /*link*/ "/ROOT/content/zimfile/A/Ray_Sings,_Basie_Swings", + /*link*/ "/ROOT/content/zimfile/A/Ray_Sings%2C_Basie_Swings", /*title*/ "Ray Sings, Basie Swings", /*snippet*/ R"SNIPPET(...from 1973 with newly recorded instrumental tracks by the contemporary Count Basie Orchestra. Professional ratings Review scores Source Rating AllMusic Ray Sings, Basie Swings Compilation album by Ray Charles, Count Basie Orchestra Released October 3, 2006 (2006-10-03) Recorded Mid-1970s, February - May 2006 Studio Los Angeles Genre Soul, jazz, Swing Label Concord/Hear Music Producer Gregg Field Ray Charles chronology Genius & Friends (2005) Ray Sings, Basie Swings (2006) Rare Genius: The Undiscovered Masters (2010)...)SNIPPET", /*bookTitle*/ "Ray Charles", From bad13d76b4996bf607197645e97ef9a7f62e9e92 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 16:26:46 +0400 Subject: [PATCH 09/16] Removed unused code --- src/tools/otherTools.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/tools/otherTools.cpp b/src/tools/otherTools.cpp index 37345e7d1..243a9a00a 100644 --- a/src/tools/otherTools.cpp +++ b/src/tools/otherTools.cpp @@ -322,9 +322,6 @@ kainjow::mustache::data kiwix::onlyAsNonEmptyMustacheValue(const std::string& s) std::string kiwix::render_template(const std::string& template_str, kainjow::mustache::data data) { kainjow::mustache::mustache tmpl(template_str); - kainjow::mustache::data urlencode{kainjow::mustache::lambda2{ - [](const std::string& str,const kainjow::mustache::renderer& r) { return urlEncode(r(str), true); }}}; - data.set("urlencoded", urlencode); std::stringstream ss; tmpl.render(data, [&ss](const std::string& str) { ss << str; }); return ss.str(); From 772243e83291b9eb2c9a8d494ef3240577dffde5 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 16:33:28 +0400 Subject: [PATCH 10/16] Category name is fully URI-encoded --- src/opds_dumper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/opds_dumper.cpp b/src/opds_dumper.cpp index fc1f01c1b..072c3ce79 100644 --- a/src/opds_dumper.cpp +++ b/src/opds_dumper.cpp @@ -241,7 +241,7 @@ std::string OPDSDumper::categoriesOPDSFeed() const const auto now = gen_date_str(); kainjow::mustache::list categoryData; for ( const auto& category : library->getBooksCategories() ) { - const auto urlencodedCategoryName = urlEncode(category); + const auto urlencodedCategoryName = urlEncode(category, true); categoryData.push_back(kainjow::mustache::object{ {"name", category}, {"urlencoded_name", urlencodedCategoryName}, From 63e0d5c7c280d60483a2fd2fbff71b84ac8056af Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 16:52:36 +0400 Subject: [PATCH 11/16] RequestContext::get_query() is fully URI-encoded --- src/opds_dumper.cpp | 2 +- src/server/request_context.cpp | 4 ++-- test/library_server.cpp | 22 +++++++++++----------- test/server.cpp | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/opds_dumper.cpp b/src/opds_dumper.cpp index 072c3ce79..d4719b4a8 100644 --- a/src/opds_dumper.cpp +++ b/src/opds_dumper.cpp @@ -216,7 +216,7 @@ string OPDSDumper::dumpOPDSFeedV2(const std::vector& bookIds, const {"endpoint_root", endpointRoot}, {"feed_id", gen_uuid(libraryId + endpoint + "?" + query)}, {"filter", onlyAsNonEmptyMustacheValue(query)}, - {"query", query.empty() ? "" : "?" + urlEncode(query)}, + {"query", query.empty() ? "" : "?" + query}, {"totalResults", to_string(m_totalResults)}, {"startIndex", to_string(m_startIndex)}, {"itemsPerPage", to_string(m_count)}, diff --git a/src/server/request_context.cpp b/src/server/request_context.cpp index 272c7f737..f5a48c77c 100644 --- a/src/server/request_context.cpp +++ b/src/server/request_context.cpp @@ -116,10 +116,10 @@ MHD_Result RequestContext::fill_argument(void *__this, enum MHD_ValueKind kind, if ( ! _this->queryString.empty() ) { _this->queryString += "&"; } - _this->queryString += key; + _this->queryString += urlEncode(key, true); if ( value ) { _this->queryString += "="; - _this->queryString += value; + _this->queryString += urlEncode(value, true); } return MHD_YES; } diff --git a/test/library_server.cpp b/test/library_server.cpp index f61d94281..a8f349ef0 100644 --- a/test/library_server.cpp +++ b/test/library_server.cpp @@ -193,7 +193,7 @@ TEST_F(LibraryServerTest, catalog_search_by_phrase) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (q="ray charles")\n" + " Filtered zims (q=%22ray%20charles%22)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" @@ -212,7 +212,7 @@ TEST_F(LibraryServerTest, catalog_search_by_words) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (q=ray charles)\n" + " Filtered zims (q=ray%20charles)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 3\n" " 0\n" @@ -233,7 +233,7 @@ TEST_F(LibraryServerTest, catalog_prefix_search) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (q=description:ray description:charles)\n" + " Filtered zims (q=description%3Aray%20description%3Acharles)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" @@ -250,7 +250,7 @@ TEST_F(LibraryServerTest, catalog_prefix_search) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (q=title:"ray charles")\n" + " Filtered zims (q=title%3A%22ray%20charles%22)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 1\n" " 0\n" @@ -269,7 +269,7 @@ TEST_F(LibraryServerTest, catalog_search_with_word_exclusion) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (q=ray -uncategorized)\n" + " Filtered zims (q=ray%20-uncategorized)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" @@ -288,7 +288,7 @@ TEST_F(LibraryServerTest, catalog_search_by_tag) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (tag=_category:jazz)\n" + " Filtered zims (tag=_category%3Ajazz)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 1\n" " 0\n" @@ -342,7 +342,7 @@ TEST_F(LibraryServerTest, catalog_search_by_language) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (lang=eng,fra)\n" + " Filtered zims (lang=eng%2Cfra)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" @@ -694,7 +694,7 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_search_terms) EXPECT_EQ(r->status, 200); EXPECT_EQ(maskVariableOPDSFeedData(r->body), CATALOG_V2_ENTRIES_PREAMBLE("?q=%22ray%20charles%22") - " Filtered Entries (q="ray charles")\n" + " Filtered Entries (q=%22ray%20charles%22)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" @@ -726,8 +726,8 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_language) const auto r = zfs1_->GET("/ROOT/catalog/v2/entries?lang=eng,fra"); EXPECT_EQ(r->status, 200); EXPECT_EQ(maskVariableOPDSFeedData(r->body), - CATALOG_V2_ENTRIES_PREAMBLE("?lang=eng,fra") - " Filtered Entries (lang=eng,fra)\n" + CATALOG_V2_ENTRIES_PREAMBLE("?lang=eng%2Cfra") + " Filtered Entries (lang=eng%2Cfra)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" @@ -865,7 +865,7 @@ TEST_F(LibraryServerTest, no_name_mapper_returned_catalog_use_uuid_in_link) EXPECT_EQ(maskVariableOPDSFeedData(r->body), OPDS_FEED_TAG " 12345678-90ab-cdef-1234-567890abcdef\n" - " Filtered zims (tag=_category:jazz)\n" + " Filtered zims (tag=_category%3Ajazz)\n" " YYYY-MM-DDThh:mm:ssZ\n" " 1\n" " 0\n" diff --git a/test/server.cpp b/test/server.cpp index 29f1885b1..1086dfe5b 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -827,7 +827,7 @@ TEST_F(ServerTest, Http400HtmlError) expected_body==R"(

Invalid request

- The requested URL "/ROOT/search?content=non-existing-book&pattern=a"<script foo>" is not a valid request. + The requested URL "/ROOT/search?content=non-existing-book&pattern=a%22%3Cscript%20foo%3E" is not a valid request.

No such book: non-existing-book @@ -910,7 +910,7 @@ TEST_F(ServerTest, HttpXmlError) /* HTTP status code */ 400, /* expected response XML */ R"( Invalid request -The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=a"<script foo>" is not a valid request. +The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=a%22%3Cscript%20foo%3E" is not a valid request. No such book: non-existing-book )" }, // There is a flaw in our way to handle query string, we cannot differenciate From 82dcba542a2dcb65a3a6efbd4ea96fb949b4a383 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 17:10:48 +0400 Subject: [PATCH 12/16] Demonstrating bugs of kiwix::getSearchUrl() --- test/opds_catalog.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/opds_catalog.cpp b/test/opds_catalog.cpp index 6647d5a32..dae777e06 100644 --- a/test/opds_catalog.cpp +++ b/test/opds_catalog.cpp @@ -37,34 +37,34 @@ TEST(OpdsCatalog, getSearchUrl) } { Filter f; - f.query("abc def"); - EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc%20def"); + f.query("abc def#xyz"); + EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc%20def#xyz"); } { Filter f; - f.category("ted"); - EXPECT_SEARCH_URL("/catalog/v2/entries?category=ted"); + f.category("ted&bob"); + EXPECT_SEARCH_URL("/catalog/v2/entries?category=ted&bob"); } { Filter f; - f.lang("eng"); - EXPECT_SEARCH_URL("/catalog/v2/entries?lang=eng"); + f.lang("eng,fra"); + EXPECT_SEARCH_URL("/catalog/v2/entries?lang=eng,fra"); } { Filter f; - f.name("second"); - EXPECT_SEARCH_URL("/catalog/v2/entries?name=second"); + f.name("second?"); + EXPECT_SEARCH_URL("/catalog/v2/entries?name=second?"); } { Filter f; - f.acceptTags({"paper", "plastic"}); - EXPECT_SEARCH_URL("/catalog/v2/entries?tag=paper;plastic"); + f.acceptTags({"#paper", "#plastic"}); + EXPECT_SEARCH_URL("/catalog/v2/entries?tag=#paper;#plastic"); } { Filter f; - f.query("abc"); - f.category("ted"); - EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc&category=ted"); + f.query("abc=123"); + f.category("@ted"); + EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc=123&category=@ted"); } { Filter f; From ec81d5904dcdf97a09eec0d538c81366f15fe8ff Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 17:16:43 +0400 Subject: [PATCH 13/16] Proper URI-encoding in kiwix::getSearchUrl() --- src/opds_catalog.cpp | 10 +++++----- test/opds_catalog.cpp | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/opds_catalog.cpp b/src/opds_catalog.cpp index ee7dacb3e..32dfad618 100644 --- a/src/opds_catalog.cpp +++ b/src/opds_catalog.cpp @@ -42,19 +42,19 @@ std::string buildSearchString(const Filter& f) { std::ostringstream oss; if ( f.hasQuery() ) - oss << AMP << "q=" << urlEncode(f.getQuery()); + oss << AMP << "q=" << urlEncode(f.getQuery(), true); if ( f.hasCategory() ) - oss << AMP << "category=" << urlEncode(f.getCategory()); + oss << AMP << "category=" << urlEncode(f.getCategory(), true); if ( f.hasLang() ) - oss << AMP << "lang=" << urlEncode(f.getLang()); + oss << AMP << "lang=" << urlEncode(f.getLang(), true); if ( f.hasName() ) - oss << AMP << "name=" << urlEncode(f.getName()); + oss << AMP << "name=" << urlEncode(f.getName(), true); if ( !f.getAcceptTags().empty() ) - oss << AMP << "tag=" << urlEncode(join(f.getAcceptTags(), ";")); + oss << AMP << "tag=" << urlEncode(join(f.getAcceptTags(), ";"), true); return oss.str(); } diff --git a/test/opds_catalog.cpp b/test/opds_catalog.cpp index dae777e06..a1936a19b 100644 --- a/test/opds_catalog.cpp +++ b/test/opds_catalog.cpp @@ -38,33 +38,33 @@ TEST(OpdsCatalog, getSearchUrl) { Filter f; f.query("abc def#xyz"); - EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc%20def#xyz"); + EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc%20def%23xyz"); } { Filter f; f.category("ted&bob"); - EXPECT_SEARCH_URL("/catalog/v2/entries?category=ted&bob"); + EXPECT_SEARCH_URL("/catalog/v2/entries?category=ted%26bob"); } { Filter f; f.lang("eng,fra"); - EXPECT_SEARCH_URL("/catalog/v2/entries?lang=eng,fra"); + EXPECT_SEARCH_URL("/catalog/v2/entries?lang=eng%2Cfra"); } { Filter f; f.name("second?"); - EXPECT_SEARCH_URL("/catalog/v2/entries?name=second?"); + EXPECT_SEARCH_URL("/catalog/v2/entries?name=second%3F"); } { Filter f; f.acceptTags({"#paper", "#plastic"}); - EXPECT_SEARCH_URL("/catalog/v2/entries?tag=#paper;#plastic"); + EXPECT_SEARCH_URL("/catalog/v2/entries?tag=%23paper%3B%23plastic"); } { Filter f; f.query("abc=123"); f.category("@ted"); - EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc=123&category=@ted"); + EXPECT_SEARCH_URL("/catalog/v2/entries?q=abc%3D123&category=%40ted"); } { Filter f; @@ -79,7 +79,7 @@ TEST(OpdsCatalog, getSearchUrl) f.lang("html"); f.name("edsonarantesdonascimento"); f.acceptTags({"body", "script"}); - EXPECT_SEARCH_URL("/catalog/v2/entries?q=peru&category=scifi&lang=html&name=edsonarantesdonascimento&tag=body;script"); + EXPECT_SEARCH_URL("/catalog/v2/entries?q=peru&category=scifi&lang=html&name=edsonarantesdonascimento&tag=body%3Bscript"); } #undef EXPECT_SEARCH_URL } From 3bf8211b700ff3ab2470b0009bc4d57de82afc79 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 18:49:51 +0400 Subject: [PATCH 14/16] Made 2nd param of urlEncode() mandatory This is a precautionary step before dropping the said parameter. --- src/tools/stringTools.h | 2 +- test/stringTools.cpp | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/tools/stringTools.h b/src/tools/stringTools.h index 3de1f086b..2f67c1917 100644 --- a/src/tools/stringTools.h +++ b/src/tools/stringTools.h @@ -55,7 +55,7 @@ private: }; -std::string urlEncode(const std::string& value, bool encodeReserved = false); +std::string urlEncode(const std::string& value, bool encodeReserved); std::string urlDecode(const std::string& value, bool component = false); std::string join(const std::vector& list, const std::string& sep); diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 102e9bf0d..2fa0e4c88 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -125,26 +125,26 @@ TEST(stringTools, urlEncode) { using namespace URLEncoding; - EXPECT_EQ(urlEncode(letters), letters); + EXPECT_EQ(urlEncode(letters,false), letters); EXPECT_EQ(urlEncode(letters, true), letters); - EXPECT_EQ(urlEncode(digits), digits); + EXPECT_EQ(urlEncode(digits,false), digits); EXPECT_EQ(urlEncode(digits, true), digits); - EXPECT_EQ(urlEncode(nonEncodableSymbols), nonEncodableSymbols); + EXPECT_EQ(urlEncode(nonEncodableSymbols,false), nonEncodableSymbols); EXPECT_EQ(urlEncode(nonEncodableSymbols, true), nonEncodableSymbols); - EXPECT_EQ(urlEncode(uriDelimSymbols), uriDelimSymbols); + EXPECT_EQ(urlEncode(uriDelimSymbols,false), uriDelimSymbols); EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%40%3F%3D%2B%26%23%24%3B%2C"); - EXPECT_EQ(urlEncode(otherSymbols), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); - EXPECT_EQ(urlEncode(otherSymbols), urlEncode(otherSymbols, true)); + EXPECT_EQ(urlEncode(otherSymbols,false), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); + EXPECT_EQ(urlEncode(otherSymbols,false), urlEncode(otherSymbols, true)); - EXPECT_EQ(urlEncode(whitespace), "%20%0A%09%0D"); - EXPECT_EQ(urlEncode(whitespace), urlEncode(whitespace, true)); + EXPECT_EQ(urlEncode(whitespace,false), "%20%0A%09%0D"); + EXPECT_EQ(urlEncode(whitespace,false), urlEncode(whitespace, true)); - EXPECT_EQ(urlEncode(someNonASCIIChars), "%CE%A3%E2%99%82%E2%99%80%E3%83%84"); - EXPECT_EQ(urlEncode(someNonASCIIChars), urlEncode(someNonASCIIChars, true)); + EXPECT_EQ(urlEncode(someNonASCIIChars,false), "%CE%A3%E2%99%82%E2%99%80%E3%83%84"); + EXPECT_EQ(urlEncode(someNonASCIIChars,false), urlEncode(someNonASCIIChars, true)); } TEST(stringTools, urlDecode) @@ -161,11 +161,12 @@ TEST(stringTools, urlDecode) for ( const char c : allTestChars ) { const std::string str(1, c); - EXPECT_EQ(urlDecode(urlEncode(str)), str); + EXPECT_EQ(urlDecode(urlEncode(str,false),false), str); EXPECT_EQ(urlDecode(urlEncode(str, true), true), str); + EXPECT_EQ(urlDecode(urlEncode(str,false)), str); } - EXPECT_EQ(urlDecode(urlEncode(allTestChars)), allTestChars); + EXPECT_EQ(urlDecode(urlEncode(allTestChars,false),false), allTestChars); EXPECT_EQ(urlDecode(urlEncode(allTestChars, true), true), allTestChars); } From 471c5b89f4440bf2002028b26f34c4fa8802d732 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 19:04:52 +0400 Subject: [PATCH 15/16] Dropped the 2nd param of urlEncode() `urlEncode(str)` is now equivalent to the previous `urlEncode(str, true)`. --- src/opds_catalog.cpp | 10 +++++----- src/opds_dumper.cpp | 4 ++-- src/search_renderer.cpp | 4 ++-- src/server/internalServer.cpp | 6 +++--- src/server/request_context.cpp | 4 ++-- src/server/request_context.h | 2 +- src/tools/stringTools.cpp | 7 ++----- src/tools/stringTools.h | 4 +++- test/stringTools.cpp | 31 ++++++++++++------------------- 9 files changed, 32 insertions(+), 40 deletions(-) diff --git a/src/opds_catalog.cpp b/src/opds_catalog.cpp index 32dfad618..ee7dacb3e 100644 --- a/src/opds_catalog.cpp +++ b/src/opds_catalog.cpp @@ -42,19 +42,19 @@ std::string buildSearchString(const Filter& f) { std::ostringstream oss; if ( f.hasQuery() ) - oss << AMP << "q=" << urlEncode(f.getQuery(), true); + oss << AMP << "q=" << urlEncode(f.getQuery()); if ( f.hasCategory() ) - oss << AMP << "category=" << urlEncode(f.getCategory(), true); + oss << AMP << "category=" << urlEncode(f.getCategory()); if ( f.hasLang() ) - oss << AMP << "lang=" << urlEncode(f.getLang(), true); + oss << AMP << "lang=" << urlEncode(f.getLang()); if ( f.hasName() ) - oss << AMP << "name=" << urlEncode(f.getName(), true); + oss << AMP << "name=" << urlEncode(f.getName()); if ( !f.getAcceptTags().empty() ) - oss << AMP << "tag=" << urlEncode(join(f.getAcceptTags(), ";"), true); + oss << AMP << "tag=" << urlEncode(join(f.getAcceptTags(), ";")); return oss.str(); } diff --git a/src/opds_dumper.cpp b/src/opds_dumper.cpp index d4719b4a8..bea9ab9df 100644 --- a/src/opds_dumper.cpp +++ b/src/opds_dumper.cpp @@ -82,7 +82,7 @@ std::string fullEntryXML(const Book& book, const std::string& rootLocation, cons {"title", book.getTitle()}, {"description", book.getDescription()}, {"language", book.getLanguage()}, - {"content_id", urlEncode(contentId, true)}, + {"content_id", urlEncode(contentId)}, {"updated", bookDate}, // XXX: this should be the entry update datetime {"book_date", bookDate}, {"category", book.getCategory()}, @@ -241,7 +241,7 @@ std::string OPDSDumper::categoriesOPDSFeed() const const auto now = gen_date_str(); kainjow::mustache::list categoryData; for ( const auto& category : library->getBooksCategories() ) { - const auto urlencodedCategoryName = urlEncode(category, true); + const auto urlencodedCategoryName = urlEncode(category); categoryData.push_back(kainjow::mustache::object{ {"name", category}, {"urlencoded_name", urlencodedCategoryName}, diff --git a/src/search_renderer.cpp b/src/search_renderer.cpp index 381354bc6..c47b93368 100644 --- a/src/search_renderer.cpp +++ b/src/search_renderer.cpp @@ -94,7 +94,7 @@ kainjow::mustache::data buildQueryData kainjow::mustache::data query; query.set("pattern", kiwix::encodeDiples(pattern)); std::ostringstream ss; - ss << searchProtocolPrefix << "?pattern=" << urlEncode(pattern, true); + ss << searchProtocolPrefix << "?pattern=" << urlEncode(pattern); ss << "&" << bookQuery; query.set("unpaginatedQuery", ss.str()); auto lang = extractValueFromQuery(bookQuery, "books.filter.lang"); @@ -174,7 +174,7 @@ std::string SearchRenderer::renderTemplate(const std::string& tmpl_str) const std::string zim_id(it.getZimId()); const auto path = mp_nameMapper->getNameForId(zim_id) + "/" + it.getPath(); result.set("title", it.getTitle()); - result.set("absolutePath", absPathPrefix + urlEncode(path, true)); + result.set("absolutePath", absPathPrefix + urlEncode(path)); result.set("snippet", it.getSnippet()); if (mp_library) { result.set("bookTitle", mp_library->getBookById(zim_id).getTitle()); diff --git a/src/server/internalServer.cpp b/src/server/internalServer.cpp index 0edb367fc..8548d4245 100644 --- a/src/server/internalServer.cpp +++ b/src/server/internalServer.cpp @@ -1030,7 +1030,7 @@ ParameterizedMessage suggestSearchMsg(const std::string& searchURL, const std::s std::unique_ptr InternalServer::build_redirect(const std::string& bookName, const zim::Item& item) const { - const auto path = kiwix::urlEncode(item.getPath(), true); + const auto path = kiwix::urlEncode(item.getPath()); const auto redirectUrl = m_root + "/content/" + bookName + "/" + path; return Response::build_redirect(*this, redirectUrl); } @@ -1055,7 +1055,7 @@ std::unique_ptr InternalServer::handle_content(const RequestContext& r } catch (const std::out_of_range& e) {} if (archive == nullptr) { - const std::string searchURL = m_root + "/search?pattern=" + kiwix::urlEncode(pattern, true); + const std::string searchURL = m_root + "/search?pattern=" + kiwix::urlEncode(pattern); return HTTP404Response(*this, request) + urlNotFoundMsg + suggestSearchMsg(searchURL, kiwix::urlDecode(pattern)); @@ -1096,7 +1096,7 @@ std::unique_ptr InternalServer::handle_content(const RequestContext& r if (m_verbose.load()) printf("Failed to find %s\n", urlStr.c_str()); - std::string searchURL = m_root + "/search?content=" + bookName + "&pattern=" + kiwix::urlEncode(pattern, true); + std::string searchURL = m_root + "/search?content=" + bookName + "&pattern=" + kiwix::urlEncode(pattern); return HTTP404Response(*this, request) + urlNotFoundMsg + suggestSearchMsg(searchURL, kiwix::urlDecode(pattern)); diff --git a/src/server/request_context.cpp b/src/server/request_context.cpp index f5a48c77c..d418d75af 100644 --- a/src/server/request_context.cpp +++ b/src/server/request_context.cpp @@ -116,10 +116,10 @@ MHD_Result RequestContext::fill_argument(void *__this, enum MHD_ValueKind kind, if ( ! _this->queryString.empty() ) { _this->queryString += "&"; } - _this->queryString += urlEncode(key, true); + _this->queryString += urlEncode(key); if ( value ) { _this->queryString += "="; - _this->queryString += urlEncode(value, true); + _this->queryString += urlEncode(value); } return MHD_YES; } diff --git a/src/server/request_context.h b/src/server/request_context.h index 689e5445d..9017a6b28 100644 --- a/src/server/request_context.h +++ b/src/server/request_context.h @@ -99,7 +99,7 @@ class RequestContext { std::string get_query(F filter, bool mustEncode) const { std::string q; const char* sep = ""; - auto encode = [=](const std::string& value) { return mustEncode?urlEncode(value, true):value; }; + auto encode = [=](const std::string& value) { return mustEncode?urlEncode(value):value; }; for ( const auto& a : arguments ) { if (!filter(a.first)) { continue; diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index 3477aaaee..a06a8fcf5 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -164,9 +164,6 @@ std::string kiwix::encodeDiples(const std::string& str) namespace { -/* urlEncode() based on javascript encodeURI() & - encodeURIComponent(). Mostly code from rstudio/httpuv (GPLv3) */ - bool isReservedUrlChar(char c) { switch (c) { @@ -246,7 +243,7 @@ int hexToInt(char c) { } // unnamed namespace -std::string kiwix::urlEncode(const std::string& value, bool encodeReserved) +std::string kiwix::urlEncode(const std::string& value) { std::ostringstream os; os << std::hex << std::uppercase; @@ -254,7 +251,7 @@ std::string kiwix::urlEncode(const std::string& value, bool encodeReserved) it != value.end(); it++) { - if (!needsEscape(*it, encodeReserved)) { + if (!needsEscape(*it, true)) { os << *it; } else { const unsigned int charVal = static_cast(*it); diff --git a/src/tools/stringTools.h b/src/tools/stringTools.h index 2f67c1917..1f55a22bc 100644 --- a/src/tools/stringTools.h +++ b/src/tools/stringTools.h @@ -55,7 +55,9 @@ private: }; -std::string urlEncode(const std::string& value, bool encodeReserved); +/* urlEncode() is the equivalent of JS encodeURIComponent(), with the only + * difference that the slash (/) symbol is NOT encoded. */ +std::string urlEncode(const std::string& value); std::string urlDecode(const std::string& value, bool component = false); std::string join(const std::vector& list, const std::string& sep); diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 2fa0e4c88..27cc712b8 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -125,26 +125,19 @@ TEST(stringTools, urlEncode) { using namespace URLEncoding; - EXPECT_EQ(urlEncode(letters,false), letters); - EXPECT_EQ(urlEncode(letters, true), letters); + EXPECT_EQ(urlEncode(letters), letters); - EXPECT_EQ(urlEncode(digits,false), digits); - EXPECT_EQ(urlEncode(digits, true), digits); + EXPECT_EQ(urlEncode(digits), digits); - EXPECT_EQ(urlEncode(nonEncodableSymbols,false), nonEncodableSymbols); - EXPECT_EQ(urlEncode(nonEncodableSymbols, true), nonEncodableSymbols); + EXPECT_EQ(urlEncode(nonEncodableSymbols), nonEncodableSymbols); - EXPECT_EQ(urlEncode(uriDelimSymbols,false), uriDelimSymbols); - EXPECT_EQ(urlEncode(uriDelimSymbols, true), "%3A%40%3F%3D%2B%26%23%24%3B%2C"); + EXPECT_EQ(urlEncode(uriDelimSymbols), "%3A%40%3F%3D%2B%26%23%24%3B%2C"); - EXPECT_EQ(urlEncode(otherSymbols,false), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); - EXPECT_EQ(urlEncode(otherSymbols,false), urlEncode(otherSymbols, true)); + EXPECT_EQ(urlEncode(otherSymbols), "%60%25%5E%5B%5D%7B%7D%5C%7C%22%3C%3E"); - EXPECT_EQ(urlEncode(whitespace,false), "%20%0A%09%0D"); - EXPECT_EQ(urlEncode(whitespace,false), urlEncode(whitespace, true)); + EXPECT_EQ(urlEncode(whitespace), "%20%0A%09%0D"); - EXPECT_EQ(urlEncode(someNonASCIIChars,false), "%CE%A3%E2%99%82%E2%99%80%E3%83%84"); - EXPECT_EQ(urlEncode(someNonASCIIChars,false), urlEncode(someNonASCIIChars, true)); + EXPECT_EQ(urlEncode(someNonASCIIChars), "%CE%A3%E2%99%82%E2%99%80%E3%83%84"); } TEST(stringTools, urlDecode) @@ -161,13 +154,13 @@ TEST(stringTools, urlDecode) for ( const char c : allTestChars ) { const std::string str(1, c); - EXPECT_EQ(urlDecode(urlEncode(str,false),false), str); - EXPECT_EQ(urlDecode(urlEncode(str, true), true), str); - EXPECT_EQ(urlDecode(urlEncode(str,false)), str); + EXPECT_EQ(urlDecode(urlEncode(str), true), str); } - EXPECT_EQ(urlDecode(urlEncode(allTestChars,false),false), allTestChars); - EXPECT_EQ(urlDecode(urlEncode(allTestChars, true), true), allTestChars); + EXPECT_EQ(urlDecode(urlEncode(allTestChars), true), allTestChars); + + const std::string encodedUriDelimSymbols = urlEncode(uriDelimSymbols); + EXPECT_EQ(urlDecode(encodedUriDelimSymbols, false), encodedUriDelimSymbols); } }; From ca079a72cc730d3dce15e61515fc90d12be9e992 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 25 Jan 2023 19:09:16 +0400 Subject: [PATCH 16/16] Some clean-up --- src/tools/stringTools.cpp | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index a06a8fcf5..3ca8124aa 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -208,17 +208,6 @@ bool isHarmlessUriChar(char c) return false; } -bool needsEscape(char c, bool encodeReserved) -{ - if (isHarmlessUriChar(c)) - return false; - - if (isReservedUrlChar(c)) - return encodeReserved; - - return true; -} - int hexToInt(char c) { switch (c) { case '0': return 0; @@ -247,14 +236,11 @@ std::string kiwix::urlEncode(const std::string& value) { std::ostringstream os; os << std::hex << std::uppercase; - for (std::string::const_iterator it = value.begin(); - it != value.end(); - it++) { - - if (!needsEscape(*it, true)) { - os << *it; + for (const char c : value) { + if (isHarmlessUriChar(c)) { + os << c; } else { - const unsigned int charVal = static_cast(*it); + const unsigned int charVal = static_cast(c); os << '%' << std::setw(2) << std::setfill('0') << charVal; } }