From 4407dd12bd6b1abca915ee225c741c5739fc831d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 28 Oct 2020 14:08:06 +0100 Subject: [PATCH 1/5] Move mimetypeCounter parsing in its own function. --- include/tools/otherTools.h | 4 ++++ src/reader.cpp | 21 ++++----------------- src/tools/otherTools.cpp | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/include/tools/otherTools.h b/include/tools/otherTools.h index 2f95f3fc2..7f80dbb9a 100644 --- a/include/tools/otherTools.h +++ b/include/tools/otherTools.h @@ -22,6 +22,8 @@ #include #include +#include +#include namespace pugi { class xml_node; @@ -41,6 +43,8 @@ namespace kiwix const std::string& tagName); bool convertStrToBool(const std::string& value); + using MimeCounterType = std::map; + MimeCounterType parseMimetypeCounter(const std::string& counterData); } #endif diff --git a/src/reader.cpp b/src/reader.cpp index 8d6b5a471..bda070cc3 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -103,29 +103,16 @@ zim::File* Reader::getZimFileHandler() const { return this->zimFileHandler; } -std::map Reader::parseCounterMetadata() const -{ - std::map counters; - string mimeType, item, counterString; - unsigned int counter; +MimeCounterType Reader::parseCounterMetadata() const +{ zim::Article article = this->zimFileHandler->getArticle('M', "Counter"); if (article.good()) { - stringstream ssContent(article.getData()); - - while (getline(ssContent, item, ';')) { - stringstream ssItem(item); - getline(ssItem, mimeType, '='); - getline(ssItem, counterString, '='); - if (!counterString.empty() && !mimeType.empty()) { - sscanf(counterString.c_str(), "%u", &counter); - counters.insert(pair(mimeType, counter)); - } - } + return parseMimetypeCounter(article.getData()); } - return counters; + return MimeCounterType(); } /* Get the count of articles which can be indexed/displayed */ diff --git a/src/tools/otherTools.cpp b/src/tools/otherTools.cpp index e4ae850be..7d61f7a83 100644 --- a/src/tools/otherTools.cpp +++ b/src/tools/otherTools.cpp @@ -280,3 +280,24 @@ bool kiwix::convertStrToBool(const std::string& value) throw std::domain_error(ss.str()); } + +kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData) +{ + kiwix::MimeCounterType counters; + std::string mimeType, item, counterString; + unsigned int counter; + + std::stringstream ssContent(counterData); + + while (getline(ssContent, item, ';')) { + std::stringstream ssItem(item); + getline(ssItem, mimeType, '='); + getline(ssItem, counterString, '='); + if (!counterString.empty() && !mimeType.empty()) { + sscanf(counterString.c_str(), "%u", &counter); + counters.insert(std::pair(mimeType, counter)); + } + } + + return counters; +} From ef42abea4bc4dcaf22fa54dd6dc197a62865e79b Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 28 Oct 2020 14:44:23 +0100 Subject: [PATCH 2/5] Add some tests of `parseMimetypeCounter` --- src/tools/otherTools.cpp | 7 +-- test/counterParsing.cpp | 114 +++++++++++++++++++++++++++++++++++++++ test/meson.build | 1 + 3 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 test/counterParsing.cpp diff --git a/src/tools/otherTools.cpp b/src/tools/otherTools.cpp index 7d61f7a83..2cbf2c6c4 100644 --- a/src/tools/otherTools.cpp +++ b/src/tools/otherTools.cpp @@ -284,18 +284,19 @@ bool kiwix::convertStrToBool(const std::string& value) kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData) { kiwix::MimeCounterType counters; - std::string mimeType, item, counterString; + std::string item; unsigned int counter; std::stringstream ssContent(counterData); while (getline(ssContent, item, ';')) { + std::string mimeType, counterString; std::stringstream ssItem(item); getline(ssItem, mimeType, '='); getline(ssItem, counterString, '='); if (!counterString.empty() && !mimeType.empty()) { - sscanf(counterString.c_str(), "%u", &counter); - counters.insert(std::pair(mimeType, counter)); + if (sscanf(counterString.c_str(), "%u", &counter)) + counters.insert(std::pair(mimeType, counter)); } } diff --git a/test/counterParsing.cpp b/test/counterParsing.cpp new file mode 100644 index 000000000..bb17b3b32 --- /dev/null +++ b/test/counterParsing.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2019 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "gtest/gtest.h" +#include +#include +#include +#include + +namespace kiwix { +using CounterType = std::map; +CounterType parseMimetypeCounter(const std::string& counterData); +}; + +using namespace kiwix; +#define parse parseMimetypeCounter + +namespace +{ +TEST(ParseCounterTest, simpleMimeType) +{ + { + std::string counterStr = ""; + CounterType counterMap = {}; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; + } + { + std::string counterStr = "foo=1"; + CounterType counterMap = {{"foo", 1}}; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; + } + { + std::string counterStr = "foo=1;text/html=50;"; + CounterType counterMap = {{"foo", 1}, {"text/html", 50}}; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; + } +} +/* +TEST(ParseCounterTest, paramMimeType) +{ + { + std::string counterStr = "text/html;raw=true=1"; + CounterType counterMap = {{"foo", 1}}; + ASSERT_EQ(parse(counterStr), counterMap); + } + { + std::string counterStr = "foo=1;text/html;raw=true=50;bar=2"; + CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}}; + ASSERT_EQ(parse(counterStr), counterMap); + } + { + std::string counterStr = "foo=1;text/html;raw=true;param=value=50;bar=2"; + CounterType counterMap = {{"foo", 1}, {"text/html;raw=true;param=value", 50}, {"bar", 2}}; + ASSERT_EQ(parse(counterStr), counterMap); + } + { + std::string counterStr = "foo=1;text/html;raw=true=50;bar=2"; + CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}}; + ASSERT_EQ(parse(counterStr), counterMap); + } +}*/ + +TEST(ParseCounterTest, wrongType) +{ + CounterType empty = {}; + { + std::string counterStr = "text/html"; + ASSERT_EQ(parse(counterStr), empty) << counterStr; + } + { + std::string counterStr = "text/html="; + ASSERT_EQ(parse(counterStr), empty) << counterStr; + } + { + std::string counterStr = "text/html=foo"; + ASSERT_EQ(parse(counterStr), empty) << counterStr; + } + { + std::string counterStr = "text/html=50;foo"; + CounterType counterMap = {{"text/html", 50}}; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; + } + /*{ + std::string counterStr = "text/html;foo=20"; + ASSERT_EQ(parse(counterStr), empty) << counterStr; + } + { + std::string counterStr = "text/html;foo=20;"; + ASSERT_EQ(parse(counterStr), empty) << counterStr; + }*/ + { + std::string counterStr = "text/html=50;;foo"; + CounterType counterMap = {{"text/html", 50}}; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; + } +} + +}; diff --git a/test/meson.build b/test/meson.build index e203bc74d..79849fe51 100644 --- a/test/meson.build +++ b/test/meson.build @@ -3,6 +3,7 @@ tests = [ 'library', 'regex', 'tagParsing', + 'counterParsing', 'stringTools', 'pathTools', 'kiwixserve', From 08464f23bc0756fc99e2e1559f95d325f52fba87 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 28 Oct 2020 15:58:35 +0100 Subject: [PATCH 3/5] Better parsing of `M/Counter` Mimetype may contain a parameters. Then, the mimetype would be something like "text/html;foo=bar;foz=baz" It will contains a `;` and `=` and it conflicts with the same operators we use to separate the items in our list. We have to use a more advanced algorithm which takes the context into account. Fix #416 --- include/tools/stringTools.h | 2 +- src/tools/otherTools.cpp | 52 ++++++++++++++++++++++++++++--------- src/tools/stringTools.cpp | 6 ++++- test/counterParsing.cpp | 43 +++++++++++++++++++++++------- test/stringTools.cpp | 23 +++++++++------- 5 files changed, 94 insertions(+), 32 deletions(-) diff --git a/include/tools/stringTools.h b/include/tools/stringTools.h index 02868f7c6..8c9a8bc9d 100644 --- a/include/tools/stringTools.h +++ b/include/tools/stringTools.h @@ -43,7 +43,7 @@ void loadICUExternalTables(); std::string urlEncode(const std::string& value, bool encodeReserved = false); std::string urlDecode(const std::string& value, bool component = false); -std::vector split(const std::string&, const std::string&, bool trimEmpty = true); +std::vector split(const std::string& str, const std::string& delims, bool trimEmpty = true, bool keepDelim = false); std::string join(const std::vector& list, const std::string& sep); std::string ucAll(const std::string& word); diff --git a/src/tools/otherTools.cpp b/src/tools/otherTools.cpp index 2cbf2c6c4..63f10c306 100644 --- a/src/tools/otherTools.cpp +++ b/src/tools/otherTools.cpp @@ -280,24 +280,52 @@ bool kiwix::convertStrToBool(const std::string& value) throw std::domain_error(ss.str()); } - +#define get_token() if (currentIt==tokens.end()) {break;} else { token = *currentIt++; } kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData) { + // The counter metadata format is a list of item separated by a `;` : + // item0;item1;item2 + // Each item is a "tuple" mimetype=number. + // However, the mimetype may contains parameters: + // text/html;raw=true;foo=bar + // So the final format may be complex to parse: + // key0=value0;key1;foo=bar=value1;key2=value2 + kiwix::MimeCounterType counters; - std::string item; - unsigned int counter; - std::stringstream ssContent(counterData); + auto tokens = split(counterData, ";=", true, true); + auto currentIt = tokens.begin(); + std::string token; - while (getline(ssContent, item, ';')) { - std::string mimeType, counterString; - std::stringstream ssItem(item); - getline(ssItem, mimeType, '='); - getline(ssItem, counterString, '='); - if (!counterString.empty() && !mimeType.empty()) { - if (sscanf(counterString.c_str(), "%u", &counter)) - counters.insert(std::pair(mimeType, counter)); + while (true) { + get_token(); + auto mimeType = token; + get_token(); + while (token == ";") { + //read param + mimeType += ";"; + get_token(); + mimeType += token; //key + get_token(); + if (token != "=") + break; + mimeType += "="; + get_token(); + mimeType += token; //value + get_token(); } + if (currentIt == tokens.end() || token != "=") + break; + + //read count + zim::article_index_type count; + get_token(); + if(!sscanf(token.c_str(), "%u", &count)) + break; + counters.insert({mimeType, count}); + get_token(); + if (token != ";") + break; } return counters; diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index e050442ba..a0eac7ba2 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -268,7 +268,8 @@ std::string kiwix::urlDecode(const std::string& value, bool component) /* Split string in a token array */ std::vector kiwix::split(const std::string& str, const std::string& delims, - bool trimEmpty) + bool trimEmpty, + bool keepDelim) { std::string::size_type lastPos = 0; std::string::size_type pos = 0; @@ -279,6 +280,9 @@ std::vector kiwix::split(const std::string& str, if (!trimEmpty || !token.empty()) { tokens.push_back(token); } + if (keepDelim) { + tokens.push_back(str.substr(pos, 1)); + } lastPos = pos + 1; } diff --git a/test/counterParsing.cpp b/test/counterParsing.cpp index bb17b3b32..fd3ed5da9 100644 --- a/test/counterParsing.cpp +++ b/test/counterParsing.cpp @@ -51,30 +51,55 @@ TEST(ParseCounterTest, simpleMimeType) ASSERT_EQ(parse(counterStr), counterMap) << counterStr; } } -/* + TEST(ParseCounterTest, paramMimeType) { { std::string counterStr = "text/html;raw=true=1"; - CounterType counterMap = {{"foo", 1}}; - ASSERT_EQ(parse(counterStr), counterMap); + CounterType counterMap = {{"text/html;raw=true", 1}}; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; } { std::string counterStr = "foo=1;text/html;raw=true=50;bar=2"; CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}}; - ASSERT_EQ(parse(counterStr), counterMap); + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; } { std::string counterStr = "foo=1;text/html;raw=true;param=value=50;bar=2"; CounterType counterMap = {{"foo", 1}, {"text/html;raw=true;param=value", 50}, {"bar", 2}}; - ASSERT_EQ(parse(counterStr), counterMap); + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; } { std::string counterStr = "foo=1;text/html;raw=true=50;bar=2"; CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}}; - ASSERT_EQ(parse(counterStr), counterMap); + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; } -}*/ + { + std::string counterStr = "application/javascript=8;text/html=3;application/warc-headers=28364;text/html;raw=true=6336;text/css=47;text/javascript=98;image/png=968;image/webp=24;application/json=3694;image/gif=10274;image/jpeg=1582;font/woff2=25;text/plain=284;application/atom+xml=247;application/x-www-form-urlencoded=9;video/mp4=9;application/x-javascript=7;application/xml=1;image/svg+xml=5"; + CounterType counterMap = { + {"application/javascript", 8}, + {"text/html", 3}, + {"application/warc-headers", 28364}, + {"text/html;raw=true", 6336}, + {"text/css", 47}, + {"text/javascript", 98}, + {"image/png", 968}, + {"image/webp", 24}, + {"application/json", 3694}, + {"image/gif", 10274}, + {"image/jpeg", 1582}, + {"font/woff2", 25}, + {"text/plain", 284}, + {"application/atom+xml", 247}, + {"application/x-www-form-urlencoded", 9}, + {"video/mp4", 9}, + {"application/x-javascript", 7}, + {"application/xml", 1}, + {"image/svg+xml", 5} + }; + ASSERT_EQ(parse(counterStr), counterMap) << counterStr; + } +} TEST(ParseCounterTest, wrongType) { @@ -96,14 +121,14 @@ TEST(ParseCounterTest, wrongType) CounterType counterMap = {{"text/html", 50}}; ASSERT_EQ(parse(counterStr), counterMap) << counterStr; } - /*{ + { std::string counterStr = "text/html;foo=20"; ASSERT_EQ(parse(counterStr), empty) << counterStr; } { std::string counterStr = "text/html;foo=20;"; ASSERT_EQ(parse(counterStr), empty) << counterStr; - }*/ + } { std::string counterStr = "text/html=50;;foo"; CounterType counterMap = {{"text/html", 50}}; diff --git a/test/stringTools.cpp b/test/stringTools.cpp index 5e0b133c8..af9c25bdf 100644 --- a/test/stringTools.cpp +++ b/test/stringTools.cpp @@ -23,7 +23,7 @@ namespace kiwix { std::string join(const std::vector& list, const std::string& sep); -std::vector split(const std::string& base, const std::string& sep, bool trimEmpty); +std::vector split(const std::string& base, const std::string& sep, bool trimEmpty, bool keepDelim); }; using namespace kiwix; @@ -40,17 +40,22 @@ TEST(stringTools, join) TEST(stringTools, split) { std::vector list1 = { "a", "b", "c" }; - ASSERT_EQ(split("a;b;c", ";", false), list1); - ASSERT_EQ(split("a;b;c", ";", true), list1); + ASSERT_EQ(split("a;b;c", ";", false, false), list1); + ASSERT_EQ(split("a;b;c", ";", true, false), list1); std::vector list2 = { "", "a", "b", "c" }; - ASSERT_EQ(split(";a;b;c", ";", false), list2); - ASSERT_EQ(split(";a;b;c", ";", true), list1); + ASSERT_EQ(split(";a;b;c", ";", false, false), list2); + ASSERT_EQ(split(";a;b;c", ";", true, false), list1); std::vector list3 = { "", "a", "b", "c", ""}; - ASSERT_EQ(split(";a;b;c;", ";", false), list3); - ASSERT_EQ(split(";a;b;c;", ";", true), list1); + ASSERT_EQ(split(";a;b;c;", ";", false, false), list3); + ASSERT_EQ(split(";a;b;c;", ";", true, false), list1); std::vector list4 = { "", "a", "b", "", "c", ""}; - ASSERT_EQ(split(";a;b;;c;", ";", false), list4); - ASSERT_EQ(split(";a;b;;c;", ";", true), list1); + ASSERT_EQ(split(";a;b;;c;", ";", false, false), list4); + ASSERT_EQ(split(";a;b;;c;", ";", true, false), list1); + + std::vector list5 = { ";", "a", ";", "b", "=", ";", "c", "=", "d", ";"}; + ASSERT_EQ(split(";a;b=;c=d;", ";=", true, true), list5); + std::vector list6 = { "", ";", "a", ";", "b", "=", "", ";", "c", "=", "d", ";", ""}; + ASSERT_EQ(split(";a;b=;c=d;", ";=", false, true), list6); } }; From ed32e16db29961b1fff8ed3f6fa5918b1a799080 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 28 Oct 2020 16:08:37 +0100 Subject: [PATCH 4/5] Use a reference in `test/server.cpp` loop. This is mainly to make the macos CI pass. --- test/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/server.cpp b/test/server.cpp index af4837123..2e18657c0 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -77,7 +77,7 @@ private: // data ZimFileServer::ZimFileServer(int serverPort, const FilePathCollection& zimpaths) : manager(&this->library) { - for ( const auto zimpath : zimpaths ) { + for ( const auto& zimpath : zimpaths ) { if (!manager.addBookFromPath(zimpath, zimpath, "", false)) throw std::runtime_error("Unable to add the ZIM file '" + zimpath + "'"); } From 0f8fe1f63fdbcf4a1fc85886573c5a3cada11441 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Thu, 29 Oct 2020 14:11:27 +0400 Subject: [PATCH 5/5] Alternative implementation of parseMimetypeCounter() --- src/tools/otherTools.cpp | 95 ++++++++++++++++++++++------------------ test/counterParsing.cpp | 4 ++ 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/src/tools/otherTools.cpp b/src/tools/otherTools.cpp index 63f10c306..b5db9d492 100644 --- a/src/tools/otherTools.cpp +++ b/src/tools/otherTools.cpp @@ -18,6 +18,7 @@ */ #include "tools/otherTools.h" +#include #ifdef _WIN32 #include @@ -280,52 +281,62 @@ bool kiwix::convertStrToBool(const std::string& value) throw std::domain_error(ss.str()); } -#define get_token() if (currentIt==tokens.end()) {break;} else { token = *currentIt++; } +namespace +{ +// The counter metadata format is a list of item separated by a `;` : +// item0;item1;item2 +// Each item is a "tuple" mimetype=number. +// However, the mimetype may contains parameters: +// text/html;raw=true;foo=bar +// So the final format may be complex to parse: +// key0=value0;key1;foo=bar=value1;key2=value2 + +typedef kiwix::MimeCounterType::value_type MimetypeAndCounter; + +std::string readFullMimetypeAndCounterString(std::istream& in) +{ + std::string mtcStr, params; + getline(in, mtcStr, ';'); + if ( mtcStr.find('=') == std::string::npos ) + { + do + { + if ( !getline(in, params, ';' ) ) + return std::string(); + mtcStr += ";" + params; + } + while ( std::count(params.begin(), params.end(), '=') != 2 ); + } + return mtcStr; +} + +MimetypeAndCounter parseASingleMimetypeCounter(const std::string& s) +{ + const std::string::size_type k = s.find_last_of("="); + if ( k != std::string::npos ) + { + const std::string mimeType = s.substr(0, k); + std::istringstream counterSS(s.substr(k+1)); + unsigned int counter; + if (counterSS >> counter && counterSS.eof()) + return MimetypeAndCounter{mimeType, counter}; + } + return MimetypeAndCounter{"", 0}; +} + +} // unnamed namespace + kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData) { - // The counter metadata format is a list of item separated by a `;` : - // item0;item1;item2 - // Each item is a "tuple" mimetype=number. - // However, the mimetype may contains parameters: - // text/html;raw=true;foo=bar - // So the final format may be complex to parse: - // key0=value0;key1;foo=bar=value1;key2=value2 - kiwix::MimeCounterType counters; + std::istringstream ss(counterData); - auto tokens = split(counterData, ";=", true, true); - auto currentIt = tokens.begin(); - std::string token; - - while (true) { - get_token(); - auto mimeType = token; - get_token(); - while (token == ";") { - //read param - mimeType += ";"; - get_token(); - mimeType += token; //key - get_token(); - if (token != "=") - break; - mimeType += "="; - get_token(); - mimeType += token; //value - get_token(); - } - if (currentIt == tokens.end() || token != "=") - break; - - //read count - zim::article_index_type count; - get_token(); - if(!sscanf(token.c_str(), "%u", &count)) - break; - counters.insert({mimeType, count}); - get_token(); - if (token != ";") - break; + while (ss) + { + const std::string mtcStr = readFullMimetypeAndCounterString(ss); + const MimetypeAndCounter mtc = parseASingleMimetypeCounter(mtcStr); + if ( !mtc.first.empty() ) + counters.insert(mtc); } return counters; diff --git a/test/counterParsing.cpp b/test/counterParsing.cpp index fd3ed5da9..b6fbde48e 100644 --- a/test/counterParsing.cpp +++ b/test/counterParsing.cpp @@ -116,6 +116,10 @@ TEST(ParseCounterTest, wrongType) std::string counterStr = "text/html=foo"; ASSERT_EQ(parse(counterStr), empty) << counterStr; } + { + std::string counterStr = "text/html=123foo"; + ASSERT_EQ(parse(counterStr), empty) << counterStr; + } { std::string counterStr = "text/html=50;foo"; CounterType counterMap = {{"text/html", 50}};