Merge pull request #967 from kiwix/opdsFilters

This commit is contained in:
Matthieu Gautier 2023-07-26 14:00:06 +02:00 committed by GitHub
commit 9c91fc7369
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 348 additions and 60 deletions

View File

@ -6,6 +6,8 @@ headers = [
'manager.h', 'manager.h',
'libxml_dumper.h', 'libxml_dumper.h',
'opds_dumper.h', 'opds_dumper.h',
'library_dumper.h',
'html_dumper.h',
'downloader.h', 'downloader.h',
'search_renderer.h', 'search_renderer.h',
'server.h', 'server.h',

View File

@ -26,6 +26,9 @@
#include <cstdint> #include <cstdint>
namespace kiwix { namespace kiwix {
typedef std::pair<std::string, std::string> LangNameCodePair;
typedef std::vector<LangNameCodePair> FeedLanguages;
typedef std::vector<std::string> FeedCategories;
/** /**
* Return the current directory. * Return the current directory.
@ -226,5 +229,28 @@ std::string getBestPublicIp();
*/ */
std::string beautifyFileSize(uint64_t number); std::string beautifyFileSize(uint64_t number);
/**
* Load languages stored in an OPDS stream.
*
* @param content the OPDS stream.
* @return vector containing pairs of language code and their corresponding full language name.
*/
FeedLanguages readLanguagesFromFeed(const std::string& content);
/**
* Load categories stored in an OPDS stream.
*
* @param content the OPDS stream.
* @return vector containing category strings.
*/
FeedCategories readCategoriesFromFeed(const std::string& content);
/**
* Retrieve the full language name associated with a given ISO 639-3 language code.
*
* @param lang ISO 639-3 language code.
* @return full language name.
*/
std::string getLanguageSelfName(const std::string& lang);
} }
#endif // KIWIX_TOOLS_H #endif // KIWIX_TOOLS_H

View File

@ -23,65 +23,6 @@ void LibraryDumper::setOpenSearchInfo(int totalResults, int startIndex, int coun
m_count = count; m_count = count;
} }
namespace {
std::map<std::string, std::string> iso639_3 = {
{"atj", "atikamekw"},
{"azb", "آذربایجان دیلی"},
{"bcl", "central bikol"},
{"bgs", "tagabawa"},
{"bxr", "буряад хэлэн"},
{"cbk", "chavacano"},
{"cdo", "閩東語"},
{"dag", "Dagbani"},
{"diq", "dimli"},
{"dty", "डोटेली"},
{"eml", "emiliân-rumagnōl"},
{"fbs", "српскохрватски"},
{"guw", "Gungbe"},
{"hbs", "srpskohrvatski"},
{"ido", "ido"},
{"kbp", "kabɩ"},
{"kld", "Gamilaraay"},
{"lbe", "лакку маз"},
{"lbj", "ལ་དྭགས་སྐད་"},
{"map", "Austronesian"},
{"mhr", "марий йылме"},
{"mnw", "ဘာသာမန်"},
{"myn", "mayan"},
{"nah", "nahuatl"},
{"nai", "north American Indian"},
{"nds", "plattdütsch"},
{"nrm", "bhasa narom"},
{"olo", "livvi"},
{"pih", "Pitcairn-Norfolk"},
{"pnb", "Western Panjabi"},
{"rmr", "Caló"},
{"rmy", "romani shib"},
{"roa", "romance languages"},
{"twi", "twi"},
};
std::once_flag fillLanguagesFlag;
void fillLanguagesMap()
{
for (auto icuLangPtr = icu::Locale::getISOLanguages(); *icuLangPtr != NULL; ++icuLangPtr) {
const ICULanguageInfo lang(*icuLangPtr);
iso639_3.insert({lang.iso3Code(), lang.selfName()});
}
}
std::string getLanguageSelfName(const std::string& lang) {
const auto itr = iso639_3.find(lang);
if (itr != iso639_3.end()) {
return itr->second;
}
return lang;
};
} // unnamed namespace
kainjow::mustache::list LibraryDumper::getCategoryData() const kainjow::mustache::list LibraryDumper::getCategoryData() const
{ {
const auto now = gen_date_str(); const auto now = gen_date_str();
@ -102,7 +43,6 @@ kainjow::mustache::list LibraryDumper::getLanguageData() const
{ {
const auto now = gen_date_str(); const auto now = gen_date_str();
kainjow::mustache::list languageData; kainjow::mustache::list languageData;
std::call_once(fillLanguagesFlag, fillLanguagesMap);
for ( const auto& langAndBookCount : library->getBooksLanguagesWithCounts() ) { for ( const auto& langAndBookCount : library->getBooksLanguagesWithCounts() ) {
const std::string languageCode = langAndBookCount.first; const std::string languageCode = langAndBookCount.first;
const int bookCount = langAndBookCount.second; const int bookCount = langAndBookCount.second;

View File

@ -17,6 +17,8 @@ kiwix_sources = [
'tools/regexTools.cpp', 'tools/regexTools.cpp',
'tools/stringTools.cpp', 'tools/stringTools.cpp',
'tools/networkTools.cpp', 'tools/networkTools.cpp',
'tools/opdsParsingTools.cpp',
'tools/languageTools.cpp',
'tools/otherTools.cpp', 'tools/otherTools.cpp',
'tools/archiveTools.cpp', 'tools/archiveTools.cpp',
'kiwixserve.cpp', 'kiwixserve.cpp',

View File

@ -0,0 +1,74 @@
#include "tools.h"
#include "stringTools.h"
#include <mutex>
namespace kiwix
{
namespace
{
// Lookup table from a language code to the name returned by
// getLanguageSelfName(). It starts out with the hand-maintained entries
// below; fillLanguagesMap() later adds the ICU-derived entries with insert(),
// which leaves keys that are already present untouched.
// The mappings listed here are not provided by the ICU library; any other
// missing mapping can be added manually here.
std::map<std::string, std::string> iso639_3 = {
{"atj", "atikamekw"},
{"azb", "آذربایجان دیلی"},
{"bcl", "central bikol"},
{"bgs", "tagabawa"},
{"bxr", "буряад хэлэн"},
{"cbk", "chavacano"},
{"cdo", "閩東語"},
{"dag", "Dagbani"},
{"diq", "dimli"},
{"dty", "डोटेली"},
{"eml", "emiliân-rumagnōl"},
{"fbs", "српскохрватски"},
{"guw", "Gungbe"},
{"hbs", "srpskohrvatski"},
{"ido", "ido"},
{"kbp", "kabɩ"},
{"kld", "Gamilaraay"},
{"lbe", "лакку маз"},
{"lbj", "ལ་དྭགས་སྐད་"},
{"map", "Austronesian"},
{"mhr", "марий йылме"},
{"mnw", "ဘာသာမန်"},
{"myn", "mayan"},
{"nah", "nahuatl"},
{"nai", "north American Indian"},
{"nds", "plattdütsch"},
{"nrm", "bhasa narom"},
{"olo", "livvi"},
{"pih", "Pitcairn-Norfolk"},
{"pnb", "Western Panjabi"},
{"rmr", "Caló"},
{"rmy", "romani shib"},
{"roa", "romance languages"},
{"twi", "twi"},
// ICU on Ubuntu releases up to focal (20.04) returns "" for the language
// code "", whereas later versions return "und". The empty code is mapped to
// "Undetermined" here so that every ICU version yields the same result.
{"", "Undetermined"},
};
std::once_flag fillLanguagesFlag;
void fillLanguagesMap()
{
for (auto icuLangPtr = icu::Locale::getISOLanguages(); *icuLangPtr != NULL; ++icuLangPtr) {
const kiwix::ICULanguageInfo lang(*icuLangPtr);
iso639_3.insert({lang.iso3Code(), lang.selfName()});
}
}
} // unnamed namespace
// Returns the name stored in iso639_3 for the language code `lang`.
// The ICU-derived part of the table is filled on the first call, guarded by
// fillLanguagesFlag. A code that is absent from the table is returned
// unchanged.
std::string getLanguageSelfName(const std::string& lang)
{
  std::call_once(fillLanguagesFlag, fillLanguagesMap);

  const auto match = iso639_3.find(lang);
  return match == iso639_3.end() ? lang : match->second;
}
} // namespace kiwix

View File

@ -0,0 +1,70 @@
#include "tools.h"
#include <pugixml.hpp>
namespace kiwix
{
namespace
{
// Shorthand that expands to entryNode.child(name).child_value(). It relies on
// a variable named `entryNode` being in scope at the point of use, so it is
// only usable inside the entry loops of the parse helpers in this file.
#define VALUE(name) entryNode.child(name).child_value()
// Collects one (language code, title) pair per <entry> child of the <feed>
// element of `doc`, in document order. The code is read from the entry's
// <dc:language> child and the title from its <title> child.
FeedLanguages parseLanguages(const pugi::xml_document& doc)
{
  FeedLanguages result;
  const pugi::xml_node feed = doc.child("feed");

  pugi::xml_node entry = feed.child("entry");
  while (entry) {
    const auto name = entry.child("title").child_value();
    const auto isoCode = entry.child("dc:language").child_value();
    result.emplace_back(isoCode, name);
    entry = entry.next_sibling("entry");
  }
  return result;
}
// Collects the <title> value of every <entry> child of the <feed> element of
// `doc`, in document order.
FeedCategories parseCategories(const pugi::xml_document& doc)
{
  FeedCategories result;
  const pugi::xml_node feed = doc.child("feed");

  pugi::xml_node entry = feed.child("entry");
  while (entry) {
    result.emplace_back(entry.child("title").child_value());
    entry = entry.next_sibling("entry");
  }
  return result;
}
} // unnamed namespace
// Parses `content` as an XML document and returns the languages listed in it
// (see parseLanguages()). An empty list is returned when the XML cannot be
// parsed.
FeedLanguages readLanguagesFromFeed(const std::string& content)
{
  pugi::xml_document doc;
  // load_buffer() accepts a const buffer, so the string's storage is passed
  // as is; no cast (and in particular no const-dropping cast) is needed.
  const pugi::xml_parse_result result
      = doc.load_buffer(content.data(), content.size());
  if (!result) {
    return FeedLanguages();
  }
  return parseLanguages(doc);
}
// Parses `content` as an XML document and returns the categories listed in it
// (see parseCategories()). An empty list is returned when the XML cannot be
// parsed.
FeedCategories readCategoriesFromFeed(const std::string& content)
{
  pugi::xml_document doc;
  // load_buffer() accepts a const buffer, so the string's storage is passed
  // as is; no cast (and in particular no const-dropping cast) is needed.
  const pugi::xml_parse_result result
      = doc.load_buffer(content.data(), content.size());
  if (!result) {
    return FeedCategories();
  }
  return parseCategories(doc);
}
} // namespace kiwix

41
test/languageTools.cpp Normal file
View File

@ -0,0 +1,41 @@
/*
* Copyright (C) 2023 Nikhil Tanwar (2002nikhiltanwar@gmail.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "gtest/gtest.h"
#include "../include/tools.h"
namespace
{

// "eng" has no hand-written entry, so its name comes from the ICU-filled part
// of the table.
TEST(LanguageToolsTest, englishTest)
{
  const auto selfName = kiwix::getLanguageSelfName("eng");
  EXPECT_EQ(selfName, "English");
}

// "dty" is one of the manually maintained mappings.
TEST(LanguageToolsTest, manualValuesTest)
{
  const auto selfName = kiwix::getLanguageSelfName("dty");
  EXPECT_EQ(selfName, "डोटेली");
}

// The empty language code is mapped to "Undetermined".
TEST(LanguageToolsTest, emptyStringTest)
{
  const auto selfName = kiwix::getLanguageSelfName("");
  EXPECT_EQ(selfName, "Undetermined");
}

} // unnamed namespace

View File

@ -5,6 +5,8 @@ tests = [
'stringTools', 'stringTools',
'pathTools', 'pathTools',
'otherTools', 'otherTools',
'opdsParsingTools',
'languageTools',
'kiwixserve', 'kiwixserve',
'book', 'book',
'manager', 'manager',

131
test/opdsParsingTools.cpp Normal file
View File

@ -0,0 +1,131 @@
/*
* Copyright (C) 2023 Nikhil Tanwar (2002nikhiltanwar@gmail.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "gtest/gtest.h"
#include "../include/tools.h"
// Local shorthands for the feed container types declared in tools.h.
using FeedLanguages = kiwix::FeedLanguages;
using FeedCategories = kiwix::FeedCategories;
namespace
{
// Fixture for languageTest: an OPDS navigation feed with two language
// entries, Abkhazian (abk) and isiZulu (zul). The raw string starts with a
// line break, so the XML declaration is not the very first character of the
// buffer.
const char sampleLanguageOpdsStream[] = R"(
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"
xmlns:dc="http://purl.org/dc/terms/"
xmlns:opds="https://specs.opds.io/opds-1.2"
xmlns:thr="http://purl.org/syndication/thread/1.0">
<id>1e587935-0f7b-dad6-eddc-ef3fafd4c3ed</id>
<link rel="self"
href="/catalog/v2/languages"
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
<link rel="start"
href="/catalog/v2/root.xml"
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
<title>List of languages</title>
<updated>2023-07-11T15:35:09Z</updated>
<entry>
<title>Abkhazian</title>
<dc:language>abk</dc:language>
<thr:count>3</thr:count>
<link rel="subsection"
href="/catalog/v2/entries?lang=abk"
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
<updated>2023-07-11T15:35:09Z</updated>
<id>2e4d9a1c-9750-0418-8124-a0c663e206f7</id>
</entry>
<entry>
<title>isiZulu</title>
<dc:language>zul</dc:language>
<thr:count>4</thr:count>
<link rel="subsection"
href="/catalog/v2/entries?lang=zul"
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
<updated>2023-07-11T15:35:09Z</updated>
<id>76eec223-994d-9b95-e309-baee06e585b0</id>
</entry>
</feed>
)";
// Fixture for categoryTest: an OPDS navigation feed with four category
// entries (gutenberg, iFixit, wikivoyage, wiktionary). The raw string starts
// with a line break, so the XML declaration is not the very first character
// of the buffer.
const char sampleCategoriesOpdsStream[] = R"(
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"
xmlns:opds="https://specs.opds.io/opds-1.2">
<id>231da20c-0fe0-7345-11b2-d29f50364108</id>
<link rel="self"
href="/catalog/v2/categories"
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
<link rel="start"
href="/catalog/v2/root.xml"
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
<title>List of categories</title>
<updated>2023-07-11T15:35:09Z</updated>
<entry>
<title>gutenberg</title>
<link rel="subsection"
href="/catalog/v2/entries?category=gutenberg"
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
<updated>2023-07-11T15:35:09Z</updated>
<id>401dbe68-2f7a-5503-b431-054801c30bab</id>
<content type="text">All entries with category of 'gutenberg'.</content>
</entry>
<entry>
<title>iFixit</title>
<link rel="subsection"
href="/catalog/v2/entries?category=iFixit"
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
<updated>2023-07-11T15:35:09Z</updated>
<id>c18e5459-af23-5fbf-0622-ff271bd9a5ad</id>
<content type="text">All entries with category of 'iFixit'.</content>
</entry>
<entry>
<title>wikivoyage</title>
<link rel="subsection"
href="/catalog/v2/entries?category=wikivoyage"
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
<updated>2023-07-11T15:35:09Z</updated>
<id>9a75be6c-7a35-6f52-1a69-bee9ad248459</id>
<content type="text">All entries with category of 'wikivoyage'.</content>
</entry>
<entry>
<title>wiktionary</title>
<link rel="subsection"
href="/catalog/v2/entries?category=wiktionary"
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
<updated>2023-07-11T15:35:09Z</updated>
<id>7adb9f1a-73d7-0391-1238-d2e2c300ddaa</id>
<content type="text">All entries with category of 'wiktionary'.</content>
</entry>
</feed>
)";
// The language feed fixture must yield its two (code, title) pairs in
// document order.
TEST(OpdsParsingTest, languageTest)
{
  const FeedLanguages expected = {{"abk", "Abkhazian"}, {"zul", "isiZulu"}};
  const FeedLanguages parsed = kiwix::readLanguagesFromFeed(sampleLanguageOpdsStream);
  EXPECT_EQ(parsed, expected);
}
// The category feed fixture must yield its four titles in document order.
TEST(OpdsParsingTest, categoryTest)
{
  const FeedCategories expected = {"gutenberg", "iFixit", "wikivoyage", "wiktionary"};
  const FeedCategories parsed = kiwix::readCategoriesFromFeed(sampleCategoriesOpdsStream);
  EXPECT_EQ(parsed, expected);
}
}