Better parsing of M/Counter

A mimetype may contain parameters.
In that case, the mimetype looks something like "text/html;foo=bar;foz=baz".

It then contains `;` and `=`, which conflict with the same characters
we use to separate the items in our list.

We have to use a more advanced algorithm which takes the context into
account.

Fix #416
This commit is contained in:
Matthieu Gautier
2020-10-28 15:58:35 +01:00
parent ef42abea4b
commit 08464f23bc
5 changed files with 94 additions and 32 deletions

View File

@ -280,24 +280,52 @@ bool kiwix::convertStrToBool(const std::string& value)
throw std::domain_error(ss.str());
}
// Helper macro for the token-based parser: fetches the next token.
// It relies on three names being in scope at the expansion site:
//   - `tokens`    : the container of tokens being walked,
//   - `currentIt` : an iterator into `tokens`,
//   - `token`     : the variable receiving the fetched token.
// If `currentIt` has reached `tokens.end()`, it executes `break`, which leaves
// only the innermost loop enclosing the expansion. Otherwise it copies the
// current token into `token` and advances `currentIt`.
#define get_token() if (currentIt==tokens.end()) {break;} else { token = *currentIt++; }
kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData)
{
// The counter metadata format is a list of items separated by `;`:
// item0;item1;item2
// Each item is a "tuple" mimetype=number.
// However, the mimetype may contain parameters:
// text/html;raw=true;foo=bar
// So the final format may be complex to parse:
// key0=value0;key1;foo=bar=value1;key2=value2
kiwix::MimeCounterType counters;
std::string item;
unsigned int counter;
std::stringstream ssContent(counterData);
auto tokens = split(counterData, ";=", true, true);
auto currentIt = tokens.begin();
std::string token;
while (getline(ssContent, item, ';')) {
std::string mimeType, counterString;
std::stringstream ssItem(item);
getline(ssItem, mimeType, '=');
getline(ssItem, counterString, '=');
if (!counterString.empty() && !mimeType.empty()) {
if (sscanf(counterString.c_str(), "%u", &counter))
counters.insert(std::pair<std::string, int>(mimeType, counter));
while (true) {
get_token();
auto mimeType = token;
get_token();
while (token == ";") {
//read param
mimeType += ";";
get_token();
mimeType += token; //key
get_token();
if (token != "=")
break;
mimeType += "=";
get_token();
mimeType += token; //value
get_token();
}
if (currentIt == tokens.end() || token != "=")
break;
//read count
zim::article_index_type count;
get_token();
if(!sscanf(token.c_str(), "%u", &count))
break;
counters.insert({mimeType, count});
get_token();
if (token != ";")
break;
}
return counters;

View File

@ -268,7 +268,8 @@ std::string kiwix::urlDecode(const std::string& value, bool component)
/* Split string in a token array */
std::vector<std::string> kiwix::split(const std::string& str,
const std::string& delims,
bool trimEmpty)
bool trimEmpty,
bool keepDelim)
{
std::string::size_type lastPos = 0;
std::string::size_type pos = 0;
@ -279,6 +280,9 @@ std::vector<std::string> kiwix::split(const std::string& str,
if (!trimEmpty || !token.empty()) {
tokens.push_back(token);
}
if (keepDelim) {
tokens.push_back(str.substr(pos, 1));
}
lastPos = pos + 1;
}