From 420be55bfa5eee363d56bfa3f7cac5f784043334 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 12 Sep 2019 15:24:17 +0200 Subject: [PATCH 1/8] Reorder methods to get metadata. Use the same order as https://wiki.openzim.org/wiki/Metadata --- include/reader.h | 56 ++++++++++++++++++++++++------------------------ src/reader.cpp | 40 +++++++++++++++++----------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/include/reader.h b/include/reader.h index f9c64e6f7..c65859d72 100644 --- a/include/reader.h +++ b/include/reader.h @@ -160,6 +160,13 @@ class Reader */ bool getMetatag(const string& name, string& value) const; + /** + * Get the name of the zim file. + * + * @return The name of the zim file as specified in the zim metadata. + */ + string getName() const; + /** * Get the title of the zim file. * @@ -169,6 +176,27 @@ class Reader */ string getTitle() const; + /** + * Get the creator of the zim file. + * + * @return The creator of the zim file as specified in the zim metadata. + */ + string getCreator() const; + + /** + * Get the publisher of the zim file. + * + * @return The publisher of the zim file as specified in the zim metadata. + */ + string getPublisher() const; + + /** + * Get the date of the zim file. + * + * @return The date of the zim file as specified in the zim metadata. + */ + string getDate() const; + /** * Get the description of the zim file. * @@ -184,13 +212,6 @@ class Reader */ string getLanguage() const; - /** - * Get the name of the zim file. - * - * @return The name of the zim file as specified in the zim metadata. - */ - string getName() const; - /** * Get the tags of the zim file. * @@ -198,27 +219,6 @@ class Reader */ string getTags() const; - /** - * Get the date of the zim file. - * - * @return The date of the zim file as specified in the zim metadata. - */ - string getDate() const; - - /** - * Get the creator of the zim file. - * - * @return The creator of the zim file as specified in the zim metadata. 
- */ - string getCreator() const; - - /** - * Get the publisher of the zim file. - * - * @return The publisher of the zim file as specified in the zim metadata. - */ - string getPublisher() const; - /** * Get the origId of the zim file. * diff --git a/src/reader.cpp b/src/reader.cpp index aa778dc92..7c3e18985 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -289,6 +289,13 @@ bool Reader::getMetatag(const string& name, string& value) const } } +string Reader::getName() const +{ + string value; + this->getMetatag("Name", value); + return value; +} + string Reader::getTitle() const { string value; @@ -302,17 +309,24 @@ string Reader::getTitle() const return value; } -string Reader::getName() const +string Reader::getCreator() const { string value; - this->getMetatag("Name", value); + this->getMetatag("Creator", value); return value; } -string Reader::getTags() const +string Reader::getPublisher() const { string value; - this->getMetatag("Tags", value); + this->getMetatag("Publisher", value); + return value; +} + +string Reader::getDate() const +{ + string value; + this->getMetatag("Date", value); return value; } @@ -336,24 +350,10 @@ string Reader::getLanguage() const return value; } -string Reader::getDate() const +string Reader::getTags() const { string value; - this->getMetatag("Date", value); - return value; -} - -string Reader::getCreator() const -{ - string value; - this->getMetatag("Creator", value); - return value; -} - -string Reader::getPublisher() const -{ - string value; - this->getMetatag("Publisher", value); + this->getMetatag("Tags", value); return value; } From 1245d4e4678df6d7ddf89480589a3ebe5595975a Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 12 Sep 2019 15:26:53 +0200 Subject: [PATCH 2/8] Use a macro to get the content of the metadata. 
--- src/reader.cpp | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/src/reader.cpp b/src/reader.cpp index 7c3e18985..b62523146 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -289,11 +289,11 @@ bool Reader::getMetatag(const string& name, string& value) const } } +#define METATAG(NAME) std::string v; getMetatag(NAME, v); return v; + string Reader::getName() const { - string value; - this->getMetatag("Name", value); - return value; + METATAG("Name") } string Reader::getTitle() const @@ -311,23 +311,17 @@ string Reader::getTitle() const string Reader::getCreator() const { - string value; - this->getMetatag("Creator", value); - return value; + METATAG("Creator") } string Reader::getPublisher() const { - string value; - this->getMetatag("Publisher", value); - return value; + METATAG("Publisher") } string Reader::getDate() const { - string value; - this->getMetatag("Date", value); - return value; + METATAG("Date") } string Reader::getDescription() const @@ -345,17 +339,14 @@ string Reader::getDescription() const string Reader::getLanguage() const { - string value; - this->getMetatag("Language", value); - return value; + METATAG("Language") } string Reader::getTags() const { - string value; - this->getMetatag("Tags", value); - return value; + METATAG("Tags") } +#undef METATAG string Reader::getOrigId() const { From bd91e89785445e466fdef2cade92d33a31e3f2cf Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 12 Sep 2019 15:33:07 +0200 Subject: [PATCH 3/8] Add missing method to get the zim metadata. 
According to https://wiki.openzim.org/wiki/Metadata --- include/reader.h | 42 ++++++++++++++++++++++++++++++++++++++++++ src/reader.cpp | 30 ++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/include/reader.h b/include/reader.h index c65859d72..9c0286792 100644 --- a/include/reader.h +++ b/include/reader.h @@ -205,6 +205,13 @@ class Reader */ string getDescription() const; + /** + * Get the long description of the zim file. + * + * @return The long description of the zim file as specified in the zim metadata. + */ + string getLongDescription() const; + /** * Get the language of the zim file. * @@ -212,6 +219,13 @@ class Reader */ string getLanguage() const; + /** + * Get the license of the zim file. + * + * @return The license of the zim file as specified in the zim metadata. + */ + string getLicense() const; + /** * Get the tags of the zim file. * @@ -219,6 +233,34 @@ class Reader */ string getTags() const; + /** + * Get the relations of the zim file. + * + * @return The relation of the zim file as specified in the zim metadata. + */ + string getRelation() const; + + /** + * Get the flavour of the zim file. + * + * @return The flavour of the zim file as specified in the zim metadata. + */ + string getFlavour() const; + + /** + * Get the source of the zim file. + * + * @return The source of the zim file as specified in the zim metadata. + */ + string getSource() const; + + /** + * Get the scraper of the zim file. + * + * @return The scraper of the zim file as specified in the zim metadata. + */ + string getScraper() const; + /** * Get the origId of the zim file. 
* diff --git a/src/reader.cpp b/src/reader.cpp index b62523146..7e7dc79e0 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -337,15 +337,45 @@ string Reader::getDescription() const return value; } +string Reader::getLongDescription() const +{ + METATAG("LongDescription") +} + string Reader::getLanguage() const { METATAG("Language") } +string Reader::getLicense() const +{ + METATAG("License") +} + string Reader::getTags() const { METATAG("Tags") } + +string Reader::getRelation() const +{ + METATAG("Relation") +} + +string Reader::getFlavour() const +{ + METATAG("Flavour") +} + +string Reader::getSource() const +{ + METATAG("Source") +} + +string Reader::getScraper() const +{ + METATAG("Scraper") +} #undef METATAG string Reader::getOrigId() const From 157c1c939c95e17e8da1d5cf93ea57acbf856823 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 16 Sep 2019 09:42:10 +0200 Subject: [PATCH 4/8] Add a string tool to join a list of strings together. --- include/tools/stringTools.h | 1 + src/tools/stringTools.cpp | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/tools/stringTools.h b/include/tools/stringTools.h index f106d7893..bfb073c27 100644 --- a/include/tools/stringTools.h +++ b/include/tools/stringTools.h @@ -47,6 +47,7 @@ std::vector split(const std::string&, const std::string&); std::vector split(const char*, const char*); std::vector split(const std::string&, const char*); std::vector split(const char*, const std::string&); +std::string join(const std::vector& list, const std::string& sep); std::string ucAll(const std::string& word); std::string lcAll(const std::string& word); diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index c1025b522..f3d206e41 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -298,6 +298,21 @@ std::vector kiwix::split(const std::string& lhs, const char* rhs) return split(lhs.c_str(), rhs); } +std::string kiwix::join(const std::vector& list, const std::string& sep) 
+{ + std::stringstream ss; + bool first = true; + for (auto& s:list) { + if (first) { + ss << sep; + first = false; + } + ss << s; + } + return ss.str(); +} + + std::string kiwix::ucFirst(const std::string& word) { if (word.empty()) { From 660d5d7fb760eacce7322bd1521995a368a81e35 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 16 Sep 2019 10:30:39 +0200 Subject: [PATCH 5/8] [API Change] Rename getMetatag to getMetadata. --- include/reader.h | 2 +- src/reader.cpp | 38 +++++++++++++++++++------------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/reader.h b/include/reader.h index 9c0286792..77d4e9793 100644 --- a/include/reader.h +++ b/include/reader.h @@ -158,7 +158,7 @@ class Reader * @param[out] value The value will be set to the content of the metadata. * @return True if it was possible to get the content of the metadata. */ - bool getMetatag(const string& name, string& value) const; + bool getMetadata(const string& name, string& value) const; /** * Get the name of the zim file. 
diff --git a/src/reader.cpp b/src/reader.cpp index 7e7dc79e0..0a02985e7 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -278,7 +278,7 @@ string Reader::getZimFilePath() const return this->zimFilePath; } /* Return a metatag value */ -bool Reader::getMetatag(const string& name, string& value) const +bool Reader::getMetadata(const string& name, string& value) const { try { auto entry = getEntryFromPath("M/"+name); @@ -289,17 +289,17 @@ bool Reader::getMetatag(const string& name, string& value) const } } -#define METATAG(NAME) std::string v; getMetatag(NAME, v); return v; +#define METADATA(NAME) std::string v; getMetadata(NAME, v); return v; string Reader::getName() const { - METATAG("Name") + METADATA("Name") } string Reader::getTitle() const { string value; - this->getMetatag("Title", value); + this->getMetadata("Title", value); if (value.empty()) { value = getLastPathElement(zimFileHandler->getFilename()); std::replace(value.begin(), value.end(), '_', ' '); @@ -311,27 +311,27 @@ string Reader::getTitle() const string Reader::getCreator() const { - METATAG("Creator") + METADATA("Creator") } string Reader::getPublisher() const { - METATAG("Publisher") + METADATA("Publisher") } string Reader::getDate() const { - METATAG("Date") + METADATA("Date") } string Reader::getDescription() const { string value; - this->getMetatag("Description", value); + this->getMetadata("Description", value); /* Mediawiki Collection tends to use the "Subtitle" name */ if (value.empty()) { - this->getMetatag("Subtitle", value); + this->getMetadata("Subtitle", value); } return value; @@ -339,49 +339,49 @@ string Reader::getDescription() const string Reader::getLongDescription() const { - METATAG("LongDescription") + METADATA("LongDescription") } string Reader::getLanguage() const { - METATAG("Language") + METADATA("Language") } string Reader::getLicense() const { - METATAG("License") + METADATA("License") } string Reader::getTags() const { - METATAG("Tags") + METADATA("Tags") } string 
Reader::getRelation() const { - METATAG("Relation") + METADATA("Relation") } string Reader::getFlavour() const { - METATAG("Flavour") + METADATA("Flavour") } string Reader::getSource() const { - METATAG("Source") + METADATA("Source") } string Reader::getScraper() const { - METATAG("Scraper") + METADATA("Scraper") } -#undef METATAG +#undef METADATA string Reader::getOrigId() const { string value; - this->getMetatag("startfileuid", value); + this->getMetadata("startfileuid", value); if (value.empty()) { return ""; } From 2a6772b76d5c97e2e3154c51c0d7514e2b1e153e Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 16 Sep 2019 10:45:41 +0200 Subject: [PATCH 6/8] [API Change] Convert tags to the new convention. Use the new convention described here: https://wiki.openzim.org/wiki/Tags --- include/reader.h | 6 ++-- src/reader.cpp | 47 +++++++++++++++++++++++++++++-- test/meson.build | 3 +- test/tagParsing.cpp | 68 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 test/tagParsing.cpp diff --git a/include/reader.h b/include/reader.h index 77d4e9793..3dcdc441d 100644 --- a/include/reader.h +++ b/include/reader.h @@ -229,9 +229,11 @@ class Reader /** * Get the tags of the zim file. * - * @return The tags of the zim file as specified in the zim metadata. + * @param original If true, return the original tags as specified in the zim metadata. + * Else, try to convert it to the new 'normalized' format. + * @return The tags of the zim file. */ - string getTags() const; + string getTags(bool original=false) const; /** * Get the relations of the zim file. 
diff --git a/src/reader.cpp b/src/reader.cpp index 0a02985e7..43b7a1491 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -352,9 +352,52 @@ string Reader::getLicense() const METADATA("License") } -string Reader::getTags() const +std::vector convertTags(const std::string& tags_str) { - METADATA("Tags") + auto tags = split(tags_str, ";"); + std::vector tagsList; + bool picSeen(false), vidSeen(false), detSeen(false), indexSeen(false); + for (auto tag: tags) { + picSeen |= (tag == "nopic" || startsWith(tag, "_pictures:")); + vidSeen |= (tag == "novid" || startsWith(tag, "_videos:")); + detSeen |= (tag == "nodet" || startsWith(tag, "_details:")); + indexSeen |= startsWith(tag, "_ftindex"); + if (tag == "nopic") { + tagsList.push_back("_pictures:no"); + } else if (tag == "novid") { + tagsList.push_back("_videos:no"); + } else if (tag == "nodet") { + tagsList.push_back("_details:no"); + } else if (tag == "_ftindex") { + tagsList.push_back("_ftindex:yes"); + } else { + tagsList.push_back(tag); + } + } + if (!indexSeen) { + tagsList.push_back("_ftindex:no"); + } + if (!picSeen) { + tagsList.push_back("_pictures:yes"); + } + if (!vidSeen) { + tagsList.push_back("_videos:yes"); + } + if (!detSeen) { + tagsList.push_back("_details:yes"); + } + return tagsList; +} + +string Reader::getTags(bool original) const +{ + string tags_str; + getMetadata("Tags", tags_str); + if (original) { + return tags_str; + } + auto tags = convertTags(tags_str); + return join(tags, ";"); } string Reader::getRelation() const diff --git a/test/meson.build b/test/meson.build index ad537ad3f..2777d1ea5 100644 --- a/test/meson.build +++ b/test/meson.build @@ -3,7 +3,8 @@ tests = [ 'parseUrl', 'library', - 'regex' + 'regex', + 'tagParsing' ] diff --git a/test/tagParsing.cpp b/test/tagParsing.cpp new file mode 100644 index 000000000..c58c1dbe4 --- /dev/null +++ b/test/tagParsing.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2019 Matthieu Gautier + * + * This program is free software; you can redistribute it 
and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "gtest/gtest.h" +#include +#include + +namespace kiwix { +std::vector convertTags(const std::string& tags); +std::string getTagValueFromTagList(const std::vector& tagList, const std::string& tagName); +}; + +using namespace kiwix; +#define parse_tag getTagValueFromTagList + +namespace +{ +TEST(ParseTagTest, convert) +{ + { + std::string tagStr = ""; + std::vector tagList = {"_ftindex:no", "_pictures:yes", "_videos:yes", "_details:yes"}; + ASSERT_EQ(convertTags(tagStr), tagList); + } + { + std::string tagStr = "_category:foo;bar"; + std::vector tagList = {"_category:foo", "bar", "_ftindex:no", "_pictures:yes", "_videos:yes", "_details:yes"}; + ASSERT_EQ(convertTags(tagStr), tagList); + } + { + std::string tagStr = "_ftindex:no;_pictures:yes;_videos:yes;_details:yes;_category:foo;bar"; + std::vector tagList = {"_ftindex:no", "_pictures:yes", "_videos:yes", "_details:yes", "_category:foo", "bar"}; + ASSERT_EQ(convertTags(tagStr), tagList); + } + { + std::string tagStr = "_ftindex:yes;_pictures:no;_videos:no;_details:no;_category:foo;bar"; + std::vector tagList = {"_ftindex:yes", "_pictures:no", "_videos:no", "_details:no", "_category:foo", "bar"}; + ASSERT_EQ(convertTags(tagStr), tagList); + } + { + std::string tagStr = "_ftindex;nopic;novid;nodet;foo;bar"; 
+ std::vector tagList = {"_ftindex:yes", "_pictures:no", "_videos:no", "_details:no", "foo", "bar"}; + ASSERT_EQ(convertTags(tagStr), tagList); + } +} + +}; +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 6ee174b54690b9063159ba658fb90b85a9d08f64 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 16 Sep 2019 10:48:36 +0200 Subject: [PATCH 7/8] Add a method to get the value of a specific tag. Fix #258 --- include/reader.h | 21 +++++++++++++++++++++ src/reader.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++ test/tagParsing.cpp | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) diff --git a/include/reader.h b/include/reader.h index 3dcdc441d..dd9f39d94 100644 --- a/include/reader.h +++ b/include/reader.h @@ -235,6 +235,27 @@ class Reader */ string getTags(bool original=false) const; + /** + * Get the value (as a string) of a specific tag. + * + * According to https://wiki.openzim.org/wiki/Tags + * + * @return The value of the specified tag. + * @throw std::out_of_range if the specified tag is not found. + */ + string getTagStr(const std::string& tagName) const; + + /** + * Get the boolean value of a specific tag. + * + * According to https://wiki.openzim.org/wiki/Tags + * + * @return The boolean value of the specified tag. + * @throw std::out_of_range if the specified tag is not found. + * std::domain_error if the value of the tag cannot be converted to bool. + */ + bool getTagBool(const std::string& tagName) const; + /** * Get the relations of the zim file. 
* diff --git a/src/reader.cpp b/src/reader.cpp index 43b7a1491..87cedc550 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -400,6 +400,48 @@ string Reader::getTags(bool original) const return join(tags, ";"); } +string getTagValueFromTagList(const std::vector& tagList, const std::string& tagName) +{ + for (auto tag: tagList) { + if (tag[0] == '_') { + auto delimPos = tag.find(':'); + if (delimPos == string::npos) { + // No delimiter... what to do ? + continue; + } + auto cTagName = tag.substr(1, delimPos-1); + auto cTagValue = tag.substr(delimPos+1); + if (cTagName == tagName) { + return cTagValue; + } + } + } + std::stringstream ss; + ss << tagName << " cannot be found"; + throw std::out_of_range(ss.str()); +} + +string Reader::getTagStr(const std::string& tagName) const +{ + string tags_str; + getMetadata("Tags", tags_str); + return getTagValueFromTagList(convertTags(tags_str), tagName); +} + +bool Reader::getTagBool(const std::string& tagName) const +{ + auto tagValue = getTagStr(tagName); + if (tagValue == "yes") { + return true; + } else if (tagValue == "no") { + return false; + } else { + std::stringstream ss; + ss << "Tag value '" << tagValue << "' for " << tagName << " cannot be converted to bool."; + throw std::domain_error(ss.str()); + } +} + string Reader::getRelation() const { METADATA("Relation") diff --git a/test/tagParsing.cpp b/test/tagParsing.cpp index c58c1dbe4..9e90519b5 100644 --- a/test/tagParsing.cpp +++ b/test/tagParsing.cpp @@ -60,6 +60,40 @@ TEST(ParseTagTest, convert) } } +TEST(ParseTagTest, valid) +{ + std::string tagStr = "_ftindex:yes;_pictures:no;_videos:no;_details:yes;_category:foo;bar"; + auto tagList = convertTags(tagStr); + + ASSERT_EQ(parse_tag(tagList, "ftindex"), "yes"); + ASSERT_EQ(parse_tag(tagList, "pictures"), "no"); + ASSERT_EQ(parse_tag(tagList, "category"), "foo"); + ASSERT_EQ(parse_tag(tagList, "details"), "yes"); + ASSERT_THROW(parse_tag(tagList, "detail"), std::out_of_range); +} + +TEST(ParseTagTest, compat) +{ + 
std::string tagStr = "_ftindex;nopic;foo;bar"; + auto tagList = convertTags(tagStr); + + ASSERT_EQ(parse_tag(tagList, "ftindex"), "yes"); + ASSERT_EQ(parse_tag(tagList, "pictures"), "no"); + ASSERT_EQ(parse_tag(tagList, "videos"), "yes"); + ASSERT_EQ(parse_tag(tagList, "details"), "yes"); +} + +TEST(ParseTagTest, invalid) +{ + std::string tagStr = "_ftindex:y;_pictures;_videos:;_details:yes;_details:no;_category:foo;bar"; + auto tagList = convertTags(tagStr); + + ASSERT_EQ(parse_tag(tagList, "ftindex"), "y"); + ASSERT_EQ(parse_tag(tagList, "pictures"), "yes"); + ASSERT_EQ(parse_tag(tagList, "videos"), ""); + ASSERT_EQ(parse_tag(tagList, "details"), "yes"); +} + }; int main(int argc, char** argv) { From 2f91149da37edaf32e6dd3a095bf43f82a0faa64 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 16 Sep 2019 10:48:56 +0200 Subject: [PATCH 8/8] Update .gitignore. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7f993beda..f4a4218ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .idea/ *.swp +subprojects/googletest-release*