mirror of https://github.com/kiwix/libkiwix.git
Merge pull request #418 from kiwix/fix_counter_parsing
This commit is contained in:
commit
0f8caba3a5
|
@ -22,6 +22,8 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <zim/zim.h>
|
||||||
|
|
||||||
namespace pugi {
|
namespace pugi {
|
||||||
class xml_node;
|
class xml_node;
|
||||||
|
@ -41,6 +43,8 @@ namespace kiwix
|
||||||
const std::string& tagName);
|
const std::string& tagName);
|
||||||
bool convertStrToBool(const std::string& value);
|
bool convertStrToBool(const std::string& value);
|
||||||
|
|
||||||
|
using MimeCounterType = std::map<const std::string, zim::article_index_type>;
|
||||||
|
MimeCounterType parseMimetypeCounter(const std::string& counterData);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -43,7 +43,7 @@ void loadICUExternalTables();
|
||||||
std::string urlEncode(const std::string& value, bool encodeReserved = false);
|
std::string urlEncode(const std::string& value, bool encodeReserved = false);
|
||||||
std::string urlDecode(const std::string& value, bool component = false);
|
std::string urlDecode(const std::string& value, bool component = false);
|
||||||
|
|
||||||
std::vector<std::string> split(const std::string&, const std::string&, bool trimEmpty = true);
|
std::vector<std::string> split(const std::string& str, const std::string& delims, bool trimEmpty = true, bool keepDelim = false);
|
||||||
std::string join(const std::vector<std::string>& list, const std::string& sep);
|
std::string join(const std::vector<std::string>& list, const std::string& sep);
|
||||||
|
|
||||||
std::string ucAll(const std::string& word);
|
std::string ucAll(const std::string& word);
|
||||||
|
|
|
@ -103,29 +103,16 @@ zim::File* Reader::getZimFileHandler() const
|
||||||
{
|
{
|
||||||
return this->zimFileHandler;
|
return this->zimFileHandler;
|
||||||
}
|
}
|
||||||
std::map<const std::string, unsigned int> Reader::parseCounterMetadata() const
|
|
||||||
{
|
|
||||||
std::map<const std::string, unsigned int> counters;
|
|
||||||
string mimeType, item, counterString;
|
|
||||||
unsigned int counter;
|
|
||||||
|
|
||||||
|
MimeCounterType Reader::parseCounterMetadata() const
|
||||||
|
{
|
||||||
zim::Article article = this->zimFileHandler->getArticle('M', "Counter");
|
zim::Article article = this->zimFileHandler->getArticle('M', "Counter");
|
||||||
|
|
||||||
if (article.good()) {
|
if (article.good()) {
|
||||||
stringstream ssContent(article.getData());
|
return parseMimetypeCounter(article.getData());
|
||||||
|
|
||||||
while (getline(ssContent, item, ';')) {
|
|
||||||
stringstream ssItem(item);
|
|
||||||
getline(ssItem, mimeType, '=');
|
|
||||||
getline(ssItem, counterString, '=');
|
|
||||||
if (!counterString.empty() && !mimeType.empty()) {
|
|
||||||
sscanf(counterString.c_str(), "%u", &counter);
|
|
||||||
counters.insert(pair<string, int>(mimeType, counter));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return counters;
|
return MimeCounterType();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Get the count of articles which can be indexed/displayed */
|
/* Get the count of articles which can be indexed/displayed */
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "tools/otherTools.h"
|
#include "tools/otherTools.h"
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
@ -280,3 +281,63 @@ bool kiwix::convertStrToBool(const std::string& value)
|
||||||
throw std::domain_error(ss.str());
|
throw std::domain_error(ss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
// The counter metadata format is a list of items separated by `;`:
|
||||||
|
// item0;item1;item2
|
||||||
|
// Each item is a "tuple" mimetype=number.
|
||||||
|
// However, the mimetype may contain parameters:
|
||||||
|
// text/html;raw=true;foo=bar
|
||||||
|
// So the final format may be complex to parse:
|
||||||
|
// key0=value0;key1;foo=bar=value1;key2=value2
|
||||||
|
|
||||||
|
typedef kiwix::MimeCounterType::value_type MimetypeAndCounter;
|
||||||
|
|
||||||
|
// Extract the next complete "mimetype=counter" item from `in`.
//
// Items are separated by ';', but a mimetype may itself carry
// ';'-separated parameters (e.g. "text/html;raw=true=50"). When the first
// ';'-delimited piece holds no '=', it is a bare mimetype whose parameters
// follow: further pieces are appended (with the ';' restored) until one
// containing exactly two '=' is found, i.e. "param=value=counter".
//
// Returns the assembled item, or an empty string if the stream runs out
// before such a terminating piece is read.
std::string readFullMimetypeAndCounterString(std::istream& in)
{
  std::string entry;
  std::getline(in, entry, ';');

  // Simple case: the piece already looks like "mimetype=counter".
  if ( entry.find('=') != std::string::npos )
    return entry;

  for (;;)
  {
    std::string chunk;
    if ( !std::getline(in, chunk, ';') )
      return std::string();

    entry.push_back(';');
    entry.append(chunk);

    // Two '=' mark the last parameter together with the counter.
    if ( std::count(chunk.begin(), chunk.end(), '=') == 2 )
      return entry;
  }
}
|
||||||
|
|
||||||
|
// Split one "mimetype=counter" item into a (mimetype, counter) pair.
//
// The counter is whatever follows the LAST '=' in `s`, so that '=' signs
// belonging to mimetype parameters ("text/html;raw=true=50") stay in the
// mimetype part. The counter text must be consumed entirely by the numeric
// extraction (trailing garbage such as "123foo" is rejected).
//
// Returns {"", 0} when `s` has no '=', or when the counter is missing,
// non-numeric, or negative.
MimetypeAndCounter parseASingleMimetypeCounter(const std::string& s)
{
  const std::string::size_type k = s.rfind('=');
  if ( k != std::string::npos )
  {
    const std::string counterText = s.substr(k+1);
    // Extraction into an unsigned type accepts a leading '-' and wraps the
    // value on common standard libraries ("-1" would become a huge count).
    // A negative count is never meaningful, so reject it up front.
    if ( counterText.find('-') == std::string::npos )
    {
      std::istringstream counterSS(counterText);
      unsigned int counter = 0;
      if (counterSS >> counter && counterSS.eof())
        return MimetypeAndCounter{s.substr(0, k), counter};
    }
  }
  return MimetypeAndCounter{"", 0};
}
|
||||||
|
|
||||||
|
} // unnamed namespace
|
||||||
|
|
||||||
|
// Parse the content of a "Counter" metadata string into a map from
// mimetype to counter value.
//
// Items are read one at a time from `counterData`; any item that does not
// parse to a non-empty mimetype is silently skipped. Reading continues
// until the underlying stream reports a failure.
kiwix::MimeCounterType kiwix::parseMimetypeCounter(const std::string& counterData)
{
  std::istringstream input(counterData);
  kiwix::MimeCounterType result;

  while ( !input.fail() )
  {
    const MimetypeAndCounter parsed =
        parseASingleMimetypeCounter(readFullMimetypeAndCounterString(input));

    // An empty mimetype is the helper's way of signalling an invalid item.
    if ( parsed.first.empty() )
      continue;

    result.insert(parsed);
  }

  return result;
}
|
||||||
|
|
|
@ -268,7 +268,8 @@ std::string kiwix::urlDecode(const std::string& value, bool component)
|
||||||
/* Split string in a token array */
|
/* Split string in a token array */
|
||||||
std::vector<std::string> kiwix::split(const std::string& str,
|
std::vector<std::string> kiwix::split(const std::string& str,
|
||||||
const std::string& delims,
|
const std::string& delims,
|
||||||
bool trimEmpty)
|
bool trimEmpty,
|
||||||
|
bool keepDelim)
|
||||||
{
|
{
|
||||||
std::string::size_type lastPos = 0;
|
std::string::size_type lastPos = 0;
|
||||||
std::string::size_type pos = 0;
|
std::string::size_type pos = 0;
|
||||||
|
@ -279,6 +280,9 @@ std::vector<std::string> kiwix::split(const std::string& str,
|
||||||
if (!trimEmpty || !token.empty()) {
|
if (!trimEmpty || !token.empty()) {
|
||||||
tokens.push_back(token);
|
tokens.push_back(token);
|
||||||
}
|
}
|
||||||
|
if (keepDelim) {
|
||||||
|
tokens.push_back(str.substr(pos, 1));
|
||||||
|
}
|
||||||
lastPos = pos + 1;
|
lastPos = pos + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,143 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2019 Matthieu Gautier
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License as
|
||||||
|
* published by the Free Software Foundation; either version 2 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but
|
||||||
|
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||||
|
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||||
|
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <zim/zim.h>
|
||||||
|
|
||||||
|
namespace kiwix {
|
||||||
|
using CounterType = std::map<const std::string, zim::article_index_type>;
|
||||||
|
CounterType parseMimetypeCounter(const std::string& counterData);
|
||||||
|
};
|
||||||
|
|
||||||
|
using namespace kiwix;
|
||||||
|
#define parse parseMimetypeCounter
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
// Counter strings made only of plain "mimetype=number" items.
TEST(ParseCounterTest, simpleMimeType)
{
  // Each entry pairs a raw counter string with the map it must parse to.
  const std::vector<std::pair<std::string, CounterType>> cases = {
    {"",                    CounterType{}},
    {"foo=1",               CounterType{{"foo", 1}}},
    {"foo=1;text/html=50;", CounterType{{"foo", 1}, {"text/html", 50}}},
  };

  for (const auto& testCase : cases) {
    ASSERT_EQ(parse(testCase.first), testCase.second) << testCase.first;
  }
}
|
||||||
|
|
||||||
|
// Counter strings whose mimetypes carry ';'-separated parameters, so the
// item separator and the parameter separator must be told apart.
TEST(ParseCounterTest, paramMimeType)
{
  // A single parameterised mimetype on its own.
  {
    std::string counterStr = "text/html;raw=true=1";
    CounterType counterMap = {{"text/html;raw=true", 1}};
    ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
  }
  // A parameterised mimetype surrounded by plain items.
  {
    std::string counterStr = "foo=1;text/html;raw=true=50;bar=2";
    CounterType counterMap = {{"foo", 1}, {"text/html;raw=true", 50}, {"bar", 2}};
    ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
  }
  // A mimetype with several parameters.
  {
    std::string counterStr = "foo=1;text/html;raw=true;param=value=50;bar=2";
    CounterType counterMap = {{"foo", 1}, {"text/html;raw=true;param=value", 50}, {"bar", 2}};
    ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
  }
  // A long counter string mixing many plain mimetypes with one
  // parameterised entry.
  {
    std::string counterStr = "application/javascript=8;text/html=3;application/warc-headers=28364;text/html;raw=true=6336;text/css=47;text/javascript=98;image/png=968;image/webp=24;application/json=3694;image/gif=10274;image/jpeg=1582;font/woff2=25;text/plain=284;application/atom+xml=247;application/x-www-form-urlencoded=9;video/mp4=9;application/x-javascript=7;application/xml=1;image/svg+xml=5";
    CounterType counterMap = {
      {"application/javascript", 8},
      {"text/html", 3},
      {"application/warc-headers", 28364},
      {"text/html;raw=true", 6336},
      {"text/css", 47},
      {"text/javascript", 98},
      {"image/png", 968},
      {"image/webp", 24},
      {"application/json", 3694},
      {"image/gif", 10274},
      {"image/jpeg", 1582},
      {"font/woff2", 25},
      {"text/plain", 284},
      {"application/atom+xml", 247},
      {"application/x-www-form-urlencoded", 9},
      {"video/mp4", 9},
      {"application/x-javascript", 7},
      {"application/xml", 1},
      {"image/svg+xml", 5}
    };
    ASSERT_EQ(parse(counterStr), counterMap) << counterStr;
  }
}
|
||||||
|
|
||||||
|
// Malformed counter strings: invalid items must be dropped, while valid
// items that precede them are still reported.
TEST(ParseCounterTest, wrongType)
{
  const CounterType nothing = {};
  const CounterType htmlOnly = {{"text/html", 50}};

  // Each entry pairs a raw counter string with the map it must parse to.
  const std::vector<std::pair<std::string, CounterType>> cases = {
    {"text/html",         nothing},
    {"text/html=",        nothing},
    {"text/html=foo",     nothing},
    {"text/html=123foo",  nothing},
    {"text/html=50;foo",  htmlOnly},
    {"text/html;foo=20",  nothing},
    {"text/html;foo=20;", nothing},
    {"text/html=50;;foo", htmlOnly},
  };

  for (const auto& testCase : cases) {
    ASSERT_EQ(parse(testCase.first), testCase.second) << testCase.first;
  }
}
|
||||||
|
|
||||||
|
};
|
|
@ -3,6 +3,7 @@ tests = [
|
||||||
'library',
|
'library',
|
||||||
'regex',
|
'regex',
|
||||||
'tagParsing',
|
'tagParsing',
|
||||||
|
'counterParsing',
|
||||||
'stringTools',
|
'stringTools',
|
||||||
'pathTools',
|
'pathTools',
|
||||||
'kiwixserve',
|
'kiwixserve',
|
||||||
|
|
|
@ -77,7 +77,7 @@ private: // data
|
||||||
ZimFileServer::ZimFileServer(int serverPort, const FilePathCollection& zimpaths)
|
ZimFileServer::ZimFileServer(int serverPort, const FilePathCollection& zimpaths)
|
||||||
: manager(&this->library)
|
: manager(&this->library)
|
||||||
{
|
{
|
||||||
for ( const auto zimpath : zimpaths ) {
|
for ( const auto& zimpath : zimpaths ) {
|
||||||
if (!manager.addBookFromPath(zimpath, zimpath, "", false))
|
if (!manager.addBookFromPath(zimpath, zimpath, "", false))
|
||||||
throw std::runtime_error("Unable to add the ZIM file '" + zimpath + "'");
|
throw std::runtime_error("Unable to add the ZIM file '" + zimpath + "'");
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
|
|
||||||
namespace kiwix {
|
namespace kiwix {
|
||||||
std::string join(const std::vector<std::string>& list, const std::string& sep);
|
std::string join(const std::vector<std::string>& list, const std::string& sep);
|
||||||
std::vector<std::string> split(const std::string& base, const std::string& sep, bool trimEmpty);
|
std::vector<std::string> split(const std::string& base, const std::string& sep, bool trimEmpty, bool keepDelim);
|
||||||
};
|
};
|
||||||
|
|
||||||
using namespace kiwix;
|
using namespace kiwix;
|
||||||
|
@ -40,17 +40,22 @@ TEST(stringTools, join)
|
||||||
TEST(stringTools, split)
|
TEST(stringTools, split)
|
||||||
{
|
{
|
||||||
std::vector<std::string> list1 = { "a", "b", "c" };
|
std::vector<std::string> list1 = { "a", "b", "c" };
|
||||||
ASSERT_EQ(split("a;b;c", ";", false), list1);
|
ASSERT_EQ(split("a;b;c", ";", false, false), list1);
|
||||||
ASSERT_EQ(split("a;b;c", ";", true), list1);
|
ASSERT_EQ(split("a;b;c", ";", true, false), list1);
|
||||||
std::vector<std::string> list2 = { "", "a", "b", "c" };
|
std::vector<std::string> list2 = { "", "a", "b", "c" };
|
||||||
ASSERT_EQ(split(";a;b;c", ";", false), list2);
|
ASSERT_EQ(split(";a;b;c", ";", false, false), list2);
|
||||||
ASSERT_EQ(split(";a;b;c", ";", true), list1);
|
ASSERT_EQ(split(";a;b;c", ";", true, false), list1);
|
||||||
std::vector<std::string> list3 = { "", "a", "b", "c", ""};
|
std::vector<std::string> list3 = { "", "a", "b", "c", ""};
|
||||||
ASSERT_EQ(split(";a;b;c;", ";", false), list3);
|
ASSERT_EQ(split(";a;b;c;", ";", false, false), list3);
|
||||||
ASSERT_EQ(split(";a;b;c;", ";", true), list1);
|
ASSERT_EQ(split(";a;b;c;", ";", true, false), list1);
|
||||||
std::vector<std::string> list4 = { "", "a", "b", "", "c", ""};
|
std::vector<std::string> list4 = { "", "a", "b", "", "c", ""};
|
||||||
ASSERT_EQ(split(";a;b;;c;", ";", false), list4);
|
ASSERT_EQ(split(";a;b;;c;", ";", false, false), list4);
|
||||||
ASSERT_EQ(split(";a;b;;c;", ";", true), list1);
|
ASSERT_EQ(split(";a;b;;c;", ";", true, false), list1);
|
||||||
|
|
||||||
|
std::vector<std::string> list5 = { ";", "a", ";", "b", "=", ";", "c", "=", "d", ";"};
|
||||||
|
ASSERT_EQ(split(";a;b=;c=d;", ";=", true, true), list5);
|
||||||
|
std::vector<std::string> list6 = { "", ";", "a", ";", "b", "=", "", ";", "c", "=", "d", ";", ""};
|
||||||
|
ASSERT_EQ(split(";a;b=;c=d;", ";=", false, true), list6);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue