Metadata
This commit is contained in:
Matthieu Gautier 2019-09-17 11:21:19 +02:00 committed by GitHub
commit 15d5b4ed58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 355 additions and 64 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
.idea/
*.swp
subprojects/googletest-release*

View File

@ -158,31 +158,7 @@ class Reader
* @param[out] value The value will be set to the content of the metadata.
* @return True if it was possible to get the content of the metadata.
*/
bool getMetatag(const string& name, string& value) const;
/**
* Get the title of the zim file.
*
* @return The title of zim file as specified in the zim metadata.
* If no title has been set, return a title computed from the
* file path.
*/
string getTitle() const;
/**
* Get the description of the zim file.
*
* @return The description of the zim file as specified in the zim metadata.
* If no description has been set, return the subtitle.
*/
string getDescription() const;
/**
* Get the language of the zim file.
*
* @return The language of the zim file as specified in the zim metadata.
*/
string getLanguage() const;
bool getMetadata(const string& name, string& value) const;
/**
* Get the name of the zim file.
@ -192,18 +168,13 @@ class Reader
string getName() const;
/**
* Get the tags of the zim file.
* Get the title of the zim file.
*
* @return The tags of the zim file as specified in the zim metadata.
* @return The title of zim file as specified in the zim metadata.
* If no title has been set, return a title computed from the
* file path.
*/
string getTags() const;
/**
* Get the date of the zim file.
*
* @return The date of the zim file as specified in the zim metadata.
*/
string getDate() const;
string getTitle() const;
/**
* Get the creator of the zim file.
@ -219,6 +190,100 @@ class Reader
*/
string getPublisher() const;
/**
* Get the date of the zim file.
*
* @return The date of the zim file as specified in the zim metadata.
*/
string getDate() const;
/**
* Get the description of the zim file.
*
* @return The description of the zim file as specified in the zim metadata.
* If no description has been set, return the subtitle.
*/
string getDescription() const;
/**
* Get the long description of the zim file.
*
* @return The long description of the zim file as specifed in the zim metadata.
*/
string getLongDescription() const;
/**
* Get the language of the zim file.
*
* @return The language of the zim file as specified in the zim metadata.
*/
string getLanguage() const;
/**
* Get the license of the zim file.
*
* @return The license of the zim file as specified in the zim metadata.
*/
string getLicense() const;
/**
* Get the tags of the zim file.
*
* @param original If true, return the original tags as specified in the zim metadata.
* Else, try to convert it to the new 'normalized' format.
* @return The tags of the zim file.
*/
string getTags(bool original=false) const;
/**
* Get the value (as a string) of a specific tag.
*
* According to https://wiki.openzim.org/wiki/Tags
*
* @return The value of the specified tag.
* @throw std::out_of_range if the specified tag is not found.
*/
string getTagStr(const std::string& tagName) const;
/**
* Get the boolean value of a specific tag.
*
* According to https://wiki.openzim.org/wiki/Tags
*
* @return The boolean value of the specified tag.
* @throw std::out_of_range if the specified tag is not found.
* std::domain_error if the value of the tag cannot be convert to bool.
*/
bool getTagBool(const std::string& tagName) const;
/**
* Get the relations of the zim file.
*
* @return The relation of the zim file as specified in the zim metadata.
*/
string getRelation() const;
/**
* Get the flavour of the zim file.
*
* @return The flavour of the zim file as specified in the zim metadata.
*/
string getFlavour() const;
/**
* Get the source of the zim file.
*
* @return The source of the zim file as specified in the zim metadata.
*/
string getSource() const;
/**
* Get the scraper of the zim file.
*
* @return The scraper of the zim file as specified in the zim metadata.
*/
string getScraper() const;
/**
* Get the origId of the zim file.
*

View File

@ -47,6 +47,7 @@ std::vector<std::string> split(const std::string&, const std::string&);
std::vector<std::string> split(const char*, const char*);
std::vector<std::string> split(const std::string&, const char*);
std::vector<std::string> split(const char*, const std::string&);
std::string join(const std::vector<std::string>& list, const std::string& sep);
std::string ucAll(const std::string& word);
std::string lcAll(const std::string& word);

View File

@ -278,7 +278,7 @@ string Reader::getZimFilePath() const
return this->zimFilePath;
}
/* Return a metatag value */
bool Reader::getMetatag(const string& name, string& value) const
bool Reader::getMetadata(const string& name, string& value) const
{
try {
auto entry = getEntryFromPath("M/"+name);
@ -289,10 +289,17 @@ bool Reader::getMetatag(const string& name, string& value) const
}
}
#define METADATA(NAME) std::string v; getMetadata(NAME, v); return v;
string Reader::getName() const
{
METADATA("Name")
}
string Reader::getTitle() const
{
string value;
this->getMetatag("Title", value);
this->getMetadata("Title", value);
if (value.empty()) {
value = getLastPathElement(zimFileHandler->getFilename());
std::replace(value.begin(), value.end(), '_', ' ');
@ -302,65 +309,164 @@ string Reader::getTitle() const
return value;
}
string Reader::getName() const
string Reader::getCreator() const
{
string value;
this->getMetatag("Name", value);
return value;
METADATA("Creator")
}
string Reader::getTags() const
string Reader::getPublisher() const
{
string value;
this->getMetatag("Tags", value);
return value;
METADATA("Publisher")
}
string Reader::getDate() const
{
METADATA("Date")
}
string Reader::getDescription() const
{
string value;
this->getMetatag("Description", value);
this->getMetadata("Description", value);
/* Mediawiki Collection tends to use the "Subtitle" name */
if (value.empty()) {
this->getMetatag("Subtitle", value);
this->getMetadata("Subtitle", value);
}
return value;
}
string Reader::getLongDescription() const
{
METADATA("LongDescription")
}
string Reader::getLanguage() const
{
string value;
this->getMetatag("Language", value);
return value;
METADATA("Language")
}
string Reader::getDate() const
string Reader::getLicense() const
{
string value;
this->getMetatag("Date", value);
return value;
METADATA("License")
}
string Reader::getCreator() const
std::vector<std::string> convertTags(const std::string& tags_str)
{
string value;
this->getMetatag("Creator", value);
return value;
auto tags = split(tags_str, ";");
std::vector<std::string> tagsList;
bool picSeen(false), vidSeen(false), detSeen(false), indexSeen(false);
for (auto tag: tags) {
picSeen |= (tag == "nopic" || startsWith(tag, "_pictures:"));
vidSeen |= (tag == "novid" || startsWith(tag, "_videos:"));
detSeen |= (tag == "nodet" || startsWith(tag, "_details:"));
indexSeen |= startsWith(tag, "_ftindex");
if (tag == "nopic") {
tagsList.push_back("_pictures:no");
} else if (tag == "novid") {
tagsList.push_back("_videos:no");
} else if (tag == "nodet") {
tagsList.push_back("_details:no");
} else if (tag == "_ftindex") {
tagsList.push_back("_ftindex:yes");
} else {
tagsList.push_back(tag);
}
}
if (!indexSeen) {
tagsList.push_back("_ftindex:no");
}
if (!picSeen) {
tagsList.push_back("_pictures:yes");
}
if (!vidSeen) {
tagsList.push_back("_videos:yes");
}
if (!detSeen) {
tagsList.push_back("_details:yes");
}
return tagsList;
}
string Reader::getPublisher() const
string Reader::getTags(bool original) const
{
string value;
this->getMetatag("Publisher", value);
return value;
string tags_str;
getMetadata("Tags", tags_str);
if (original) {
return tags_str;
}
auto tags = convertTags(tags_str);
return join(tags, ";");
}
string getTagValueFromTagList(const std::vector<std::string>& tagList, const std::string& tagName)
{
for (auto tag: tagList) {
if (tag[0] == '_') {
auto delimPos = tag.find(':');
if (delimPos == string::npos) {
// No delimiter... what to do ?
continue;
}
auto cTagName = tag.substr(1, delimPos-1);
auto cTagValue = tag.substr(delimPos+1);
if (cTagName == tagName) {
return cTagValue;
}
}
}
std::stringstream ss;
ss << tagName << " cannot be found";
throw std::out_of_range(ss.str());
}
string Reader::getTagStr(const std::string& tagName) const
{
string tags_str;
getMetadata("Tags", tags_str);
return getTagValueFromTagList(convertTags(tags_str), tagName);
}
bool Reader::getTagBool(const std::string& tagName) const
{
auto tagValue = getTagStr(tagName);
if (tagValue == "yes") {
return true;
} else if (tagValue == "no") {
return false;
} else {
std::stringstream ss;
ss << "Tag value '" << tagValue << "' for " << tagName << " cannot be converted to bool.";
throw std::domain_error(ss.str());
}
}
string Reader::getRelation() const
{
METADATA("Relation")
}
string Reader::getFlavour() const
{
METADATA("Flavour")
}
string Reader::getSource() const
{
METADATA("Source")
}
string Reader::getScraper() const
{
METADATA("Scraper")
}
#undef METADATA
string Reader::getOrigId() const
{
string value;
this->getMetatag("startfileuid", value);
this->getMetadata("startfileuid", value);
if (value.empty()) {
return "";
}

View File

@ -298,6 +298,21 @@ std::vector<std::string> kiwix::split(const std::string& lhs, const char* rhs)
return split(lhs.c_str(), rhs);
}
std::string kiwix::join(const std::vector<std::string>& list, const std::string& sep)
{
std::stringstream ss;
bool first = true;
for (auto& s:list) {
if (first) {
ss << sep;
first = false;
}
ss << s;
}
return ss.str();
}
std::string kiwix::ucFirst(const std::string& word)
{
if (word.empty()) {

View File

@ -3,7 +3,8 @@
tests = [
'parseUrl',
'library',
'regex'
'regex',
'tagParsing'
]

102
test/tagParsing.cpp Normal file
View File

@ -0,0 +1,102 @@
/*
* Copyright (C) 2019 Matthieu Gautier
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "gtest/gtest.h"
#include <string>
#include <vector>
namespace kiwix {
std::vector<std::string> convertTags(const std::string& tags);
std::string getTagValueFromTagList(const std::vector<std::string>& tagList, const std::string& tagName);
};
using namespace kiwix;
#define parse_tag getTagValueFromTagList
namespace
{
TEST(ParseTagTest, convert)
{
{
std::string tagStr = "";
std::vector<std::string> tagList = {"_ftindex:no", "_pictures:yes", "_videos:yes", "_details:yes"};
ASSERT_EQ(convertTags(tagStr), tagList);
}
{
std::string tagStr = "_category:foo;bar";
std::vector<std::string> tagList = {"_category:foo", "bar", "_ftindex:no", "_pictures:yes", "_videos:yes", "_details:yes"};
ASSERT_EQ(convertTags(tagStr), tagList);
}
{
std::string tagStr = "_ftindex:no;_pictures:yes;_videos:yes;_details:yes;_category:foo;bar";
std::vector<std::string> tagList = {"_ftindex:no", "_pictures:yes", "_videos:yes", "_details:yes", "_category:foo", "bar"};
ASSERT_EQ(convertTags(tagStr), tagList);
}
{
std::string tagStr = "_ftindex:yes;_pictures:no;_videos:no;_details:no;_category:foo;bar";
std::vector<std::string> tagList = {"_ftindex:yes", "_pictures:no", "_videos:no", "_details:no", "_category:foo", "bar"};
ASSERT_EQ(convertTags(tagStr), tagList);
}
{
std::string tagStr = "_ftindex;nopic;novid;nodet;foo;bar";
std::vector<std::string> tagList = {"_ftindex:yes", "_pictures:no", "_videos:no", "_details:no", "foo", "bar"};
ASSERT_EQ(convertTags(tagStr), tagList);
}
}
TEST(ParseTagTest, valid)
{
std::string tagStr = "_ftindex:yes;_pictures:no;_videos:no;_details:yes;_category:foo;bar";
auto tagList = convertTags(tagStr);
ASSERT_EQ(parse_tag(tagList, "ftindex"), "yes");
ASSERT_EQ(parse_tag(tagList, "pictures"), "no");
ASSERT_EQ(parse_tag(tagList, "category"), "foo");
ASSERT_EQ(parse_tag(tagList, "details"), "yes");
ASSERT_THROW(parse_tag(tagList, "detail"), std::out_of_range);
}
TEST(ParseTagTest, compat)
{
std::string tagStr = "_ftindex;nopic;foo;bar";
auto tagList = convertTags(tagStr);
ASSERT_EQ(parse_tag(tagList, "ftindex"), "yes");
ASSERT_EQ(parse_tag(tagList, "pictures"), "no");
ASSERT_EQ(parse_tag(tagList, "videos"), "yes");
ASSERT_EQ(parse_tag(tagList, "details"), "yes");
}
TEST(ParseTagTest, invalid)
{
std::string tagStr = "_ftindex:y;_pictures;_videos:;_details:yes;_details:no;_category:foo;bar";
auto tagList = convertTags(tagStr);
ASSERT_EQ(parse_tag(tagList, "ftindex"), "y");
ASSERT_EQ(parse_tag(tagList, "pictures"), "yes");
ASSERT_EQ(parse_tag(tagList, "videos"), "");
ASSERT_EQ(parse_tag(tagList, "details"), "yes");
}
};
int main(int argc, char** argv)
{
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}