mirror of https://github.com/kiwix/libkiwix.git
Merge pull request #967 from kiwix/opdsFilters
This commit is contained in:
commit
9c91fc7369
|
@ -6,6 +6,8 @@ headers = [
|
|||
'manager.h',
|
||||
'libxml_dumper.h',
|
||||
'opds_dumper.h',
|
||||
'library_dumper.h',
|
||||
'html_dumper.h',
|
||||
'downloader.h',
|
||||
'search_renderer.h',
|
||||
'server.h',
|
||||
|
|
|
@ -26,6 +26,9 @@
|
|||
#include <cstdint>
|
||||
|
||||
namespace kiwix {
|
||||
typedef std::pair<std::string, std::string> LangNameCodePair;
|
||||
typedef std::vector<LangNameCodePair> FeedLanguages;
|
||||
typedef std::vector<std::string> FeedCategories;
|
||||
|
||||
/**
|
||||
* Return the current directory.
|
||||
|
@ -226,5 +229,28 @@ std::string getBestPublicIp();
|
|||
*/
|
||||
std::string beautifyFileSize(uint64_t number);
|
||||
|
||||
/**
|
||||
* Load languages stored in an OPDS stream.
|
||||
*
|
||||
* @param content the OPDS stream.
|
||||
* @return vector containing pairs of language code and their corresponding full language name.
|
||||
*/
|
||||
FeedLanguages readLanguagesFromFeed(const std::string& content);
|
||||
|
||||
/**
|
||||
* Load categories stored in an OPDS stream .
|
||||
*
|
||||
* @param content the OPDS stream.
|
||||
* @return vector containing category strings.
|
||||
*/
|
||||
FeedCategories readCategoriesFromFeed(const std::string& content);
|
||||
|
||||
/**
|
||||
* Retrieve the full language name associated with a given ISO 639-3 language code.
|
||||
*
|
||||
* @param lang ISO 639-3 language code.
|
||||
* @return full language name.
|
||||
*/
|
||||
std::string getLanguageSelfName(const std::string& lang);
|
||||
}
|
||||
#endif // KIWIX_TOOLS_H
|
||||
|
|
|
@ -23,65 +23,6 @@ void LibraryDumper::setOpenSearchInfo(int totalResults, int startIndex, int coun
|
|||
m_count = count;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
std::map<std::string, std::string> iso639_3 = {
|
||||
{"atj", "atikamekw"},
|
||||
{"azb", "آذربایجان دیلی"},
|
||||
{"bcl", "central bikol"},
|
||||
{"bgs", "tagabawa"},
|
||||
{"bxr", "буряад хэлэн"},
|
||||
{"cbk", "chavacano"},
|
||||
{"cdo", "閩東語"},
|
||||
{"dag", "Dagbani"},
|
||||
{"diq", "dimli"},
|
||||
{"dty", "डोटेली"},
|
||||
{"eml", "emiliân-rumagnōl"},
|
||||
{"fbs", "српскохрватски"},
|
||||
{"guw", "Gungbe"},
|
||||
{"hbs", "srpskohrvatski"},
|
||||
{"ido", "ido"},
|
||||
{"kbp", "kabɩyɛ"},
|
||||
{"kld", "Gamilaraay"},
|
||||
{"lbe", "лакку маз"},
|
||||
{"lbj", "ལ་དྭགས་སྐད་"},
|
||||
{"map", "Austronesian"},
|
||||
{"mhr", "марий йылме"},
|
||||
{"mnw", "ဘာသာမန်"},
|
||||
{"myn", "mayan"},
|
||||
{"nah", "nahuatl"},
|
||||
{"nai", "north American Indian"},
|
||||
{"nds", "plattdütsch"},
|
||||
{"nrm", "bhasa narom"},
|
||||
{"olo", "livvi"},
|
||||
{"pih", "Pitcairn-Norfolk"},
|
||||
{"pnb", "Western Panjabi"},
|
||||
{"rmr", "Caló"},
|
||||
{"rmy", "romani shib"},
|
||||
{"roa", "romance languages"},
|
||||
{"twi", "twi"},
|
||||
};
|
||||
|
||||
std::once_flag fillLanguagesFlag;
|
||||
|
||||
void fillLanguagesMap()
|
||||
{
|
||||
for (auto icuLangPtr = icu::Locale::getISOLanguages(); *icuLangPtr != NULL; ++icuLangPtr) {
|
||||
const ICULanguageInfo lang(*icuLangPtr);
|
||||
iso639_3.insert({lang.iso3Code(), lang.selfName()});
|
||||
}
|
||||
}
|
||||
|
||||
std::string getLanguageSelfName(const std::string& lang) {
|
||||
const auto itr = iso639_3.find(lang);
|
||||
if (itr != iso639_3.end()) {
|
||||
return itr->second;
|
||||
}
|
||||
return lang;
|
||||
};
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
kainjow::mustache::list LibraryDumper::getCategoryData() const
|
||||
{
|
||||
const auto now = gen_date_str();
|
||||
|
@ -102,7 +43,6 @@ kainjow::mustache::list LibraryDumper::getLanguageData() const
|
|||
{
|
||||
const auto now = gen_date_str();
|
||||
kainjow::mustache::list languageData;
|
||||
std::call_once(fillLanguagesFlag, fillLanguagesMap);
|
||||
for ( const auto& langAndBookCount : library->getBooksLanguagesWithCounts() ) {
|
||||
const std::string languageCode = langAndBookCount.first;
|
||||
const int bookCount = langAndBookCount.second;
|
||||
|
|
|
@ -17,6 +17,8 @@ kiwix_sources = [
|
|||
'tools/regexTools.cpp',
|
||||
'tools/stringTools.cpp',
|
||||
'tools/networkTools.cpp',
|
||||
'tools/opdsParsingTools.cpp',
|
||||
'tools/languageTools.cpp',
|
||||
'tools/otherTools.cpp',
|
||||
'tools/archiveTools.cpp',
|
||||
'kiwixserve.cpp',
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
#include "tools.h"
|
||||
#include "stringTools.h"
|
||||
#include <mutex>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// These mappings are not provided by the ICU library, any such mappings can be manually added here
|
||||
std::map<std::string, std::string> iso639_3 = {
|
||||
{"atj", "atikamekw"},
|
||||
{"azb", "آذربایجان دیلی"},
|
||||
{"bcl", "central bikol"},
|
||||
{"bgs", "tagabawa"},
|
||||
{"bxr", "буряад хэлэн"},
|
||||
{"cbk", "chavacano"},
|
||||
{"cdo", "閩東語"},
|
||||
{"dag", "Dagbani"},
|
||||
{"diq", "dimli"},
|
||||
{"dty", "डोटेली"},
|
||||
{"eml", "emiliân-rumagnōl"},
|
||||
{"fbs", "српскохрватски"},
|
||||
{"guw", "Gungbe"},
|
||||
{"hbs", "srpskohrvatski"},
|
||||
{"ido", "ido"},
|
||||
{"kbp", "kabɩyɛ"},
|
||||
{"kld", "Gamilaraay"},
|
||||
{"lbe", "лакку маз"},
|
||||
{"lbj", "ལ་དྭགས་སྐད་"},
|
||||
{"map", "Austronesian"},
|
||||
{"mhr", "марий йылме"},
|
||||
{"mnw", "ဘာသာမန်"},
|
||||
{"myn", "mayan"},
|
||||
{"nah", "nahuatl"},
|
||||
{"nai", "north American Indian"},
|
||||
{"nds", "plattdütsch"},
|
||||
{"nrm", "bhasa narom"},
|
||||
{"olo", "livvi"},
|
||||
{"pih", "Pitcairn-Norfolk"},
|
||||
{"pnb", "Western Panjabi"},
|
||||
{"rmr", "Caló"},
|
||||
{"rmy", "romani shib"},
|
||||
{"roa", "romance languages"},
|
||||
{"twi", "twi"},
|
||||
// ICU for Ubuntu versions <= focal (20.04) returns "" for the language code ""
|
||||
// unlike the later versions - which returns "und". We map this value to "Undetermined" for a common ground.
|
||||
{"", "Undetermined"},
|
||||
};
|
||||
|
||||
std::once_flag fillLanguagesFlag;
|
||||
|
||||
void fillLanguagesMap()
|
||||
{
|
||||
for (auto icuLangPtr = icu::Locale::getISOLanguages(); *icuLangPtr != NULL; ++icuLangPtr) {
|
||||
const kiwix::ICULanguageInfo lang(*icuLangPtr);
|
||||
iso639_3.insert({lang.iso3Code(), lang.selfName()});
|
||||
}
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
std::string getLanguageSelfName(const std::string& lang)
|
||||
{
|
||||
std::call_once(fillLanguagesFlag, fillLanguagesMap);
|
||||
const auto itr = iso639_3.find(lang);
|
||||
if (itr != iso639_3.end()) {
|
||||
return itr->second;
|
||||
}
|
||||
return lang;
|
||||
};
|
||||
|
||||
} // namespace kiwix
|
|
@ -0,0 +1,70 @@
|
|||
#include "tools.h"
|
||||
#include <pugixml.hpp>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
#define VALUE(name) entryNode.child(name).child_value()
|
||||
FeedLanguages parseLanguages(const pugi::xml_document& doc)
|
||||
{
|
||||
pugi::xml_node feedNode = doc.child("feed");
|
||||
FeedLanguages langs;
|
||||
|
||||
for (pugi::xml_node entryNode = feedNode.child("entry"); entryNode;
|
||||
entryNode = entryNode.next_sibling("entry")) {
|
||||
auto title = VALUE("title");
|
||||
auto code = VALUE("dc:language");
|
||||
langs.push_back({code, title});
|
||||
}
|
||||
|
||||
return langs;
|
||||
}
|
||||
|
||||
FeedCategories parseCategories(const pugi::xml_document& doc)
|
||||
{
|
||||
pugi::xml_node feedNode = doc.child("feed");
|
||||
FeedCategories categories;
|
||||
|
||||
for (pugi::xml_node entryNode = feedNode.child("entry"); entryNode;
|
||||
entryNode = entryNode.next_sibling("entry")) {
|
||||
auto title = VALUE("title");
|
||||
categories.push_back(title);
|
||||
}
|
||||
|
||||
return categories;
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
FeedLanguages readLanguagesFromFeed(const std::string& content)
|
||||
{
|
||||
pugi::xml_document doc;
|
||||
pugi::xml_parse_result result
|
||||
= doc.load_buffer((void*)content.data(), content.size());
|
||||
|
||||
if (result) {
|
||||
auto langs = parseLanguages(doc);
|
||||
return langs;
|
||||
}
|
||||
|
||||
return FeedLanguages();
|
||||
}
|
||||
|
||||
FeedCategories readCategoriesFromFeed(const std::string& content)
|
||||
{
|
||||
pugi::xml_document doc;
|
||||
pugi::xml_parse_result result
|
||||
= doc.load_buffer((void*)content.data(), content.size());
|
||||
|
||||
FeedCategories categories;
|
||||
if (result) {
|
||||
categories = parseCategories(doc);
|
||||
return categories;
|
||||
}
|
||||
|
||||
return categories;
|
||||
}
|
||||
|
||||
} // namespace kiwix
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Nikhil Tanwar (2002nikhiltanwar@gmail.com)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "../include/tools.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
TEST(LanguageToolsTest, englishTest)
|
||||
{
|
||||
EXPECT_EQ(kiwix::getLanguageSelfName("eng"), "English");
|
||||
}
|
||||
|
||||
TEST(LanguageToolsTest, manualValuesTest)
|
||||
{
|
||||
EXPECT_EQ(kiwix::getLanguageSelfName("dty"), "डोटेली");
|
||||
}
|
||||
|
||||
TEST(LanguageToolsTest, emptyStringTest)
|
||||
{
|
||||
EXPECT_EQ(kiwix::getLanguageSelfName(""), "Undetermined");
|
||||
}
|
||||
|
||||
}
|
|
@ -5,6 +5,8 @@ tests = [
|
|||
'stringTools',
|
||||
'pathTools',
|
||||
'otherTools',
|
||||
'opdsParsingTools',
|
||||
'languageTools',
|
||||
'kiwixserve',
|
||||
'book',
|
||||
'manager',
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Nikhil Tanwar (2002nikhiltanwar@gmail.com)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "../include/tools.h"
|
||||
typedef kiwix::FeedLanguages FeedLanguages;
|
||||
typedef kiwix::FeedCategories FeedCategories;
|
||||
|
||||
namespace
|
||||
{
|
||||
const char sampleLanguageOpdsStream[] = R"(
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom"
|
||||
xmlns:dc="http://purl.org/dc/terms/"
|
||||
xmlns:opds="https://specs.opds.io/opds-1.2"
|
||||
xmlns:thr="http://purl.org/syndication/thread/1.0">
|
||||
<id>1e587935-0f7b-dad6-eddc-ef3fafd4c3ed</id>
|
||||
<link rel="self"
|
||||
href="/catalog/v2/languages"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
|
||||
<link rel="start"
|
||||
href="/catalog/v2/root.xml"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
|
||||
<title>List of languages</title>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
|
||||
<entry>
|
||||
<title>Abkhazian</title>
|
||||
<dc:language>abk</dc:language>
|
||||
<thr:count>3</thr:count>
|
||||
<link rel="subsection"
|
||||
href="/catalog/v2/entries?lang=abk"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
<id>2e4d9a1c-9750-0418-8124-a0c663e206f7</id>
|
||||
</entry>
|
||||
<entry>
|
||||
<title>isiZulu</title>
|
||||
<dc:language>zul</dc:language>
|
||||
<thr:count>4</thr:count>
|
||||
<link rel="subsection"
|
||||
href="/catalog/v2/entries?lang=zul"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
<id>76eec223-994d-9b95-e309-baee06e585b0</id>
|
||||
</entry>
|
||||
</feed>
|
||||
)";
|
||||
|
||||
const char sampleCategoriesOpdsStream[] = R"(
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom"
|
||||
xmlns:opds="https://specs.opds.io/opds-1.2">
|
||||
<id>231da20c-0fe0-7345-11b2-d29f50364108</id>
|
||||
<link rel="self"
|
||||
href="/catalog/v2/categories"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
|
||||
<link rel="start"
|
||||
href="/catalog/v2/root.xml"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
|
||||
<title>List of categories</title>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
|
||||
<entry>
|
||||
<title>gutenberg</title>
|
||||
<link rel="subsection"
|
||||
href="/catalog/v2/entries?category=gutenberg"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
<id>401dbe68-2f7a-5503-b431-054801c30bab</id>
|
||||
<content type="text">All entries with category of 'gutenberg'.</content>
|
||||
</entry>
|
||||
<entry>
|
||||
<title>iFixit</title>
|
||||
<link rel="subsection"
|
||||
href="/catalog/v2/entries?category=iFixit"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
<id>c18e5459-af23-5fbf-0622-ff271bd9a5ad</id>
|
||||
<content type="text">All entries with category of 'iFixit'.</content>
|
||||
</entry>
|
||||
<entry>
|
||||
<title>wikivoyage</title>
|
||||
<link rel="subsection"
|
||||
href="/catalog/v2/entries?category=wikivoyage"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
<id>9a75be6c-7a35-6f52-1a69-bee9ad248459</id>
|
||||
<content type="text">All entries with category of 'wikivoyage'.</content>
|
||||
</entry>
|
||||
<entry>
|
||||
<title>wiktionary</title>
|
||||
<link rel="subsection"
|
||||
href="/catalog/v2/entries?category=wiktionary"
|
||||
type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
|
||||
<updated>2023-07-11T15:35:09Z</updated>
|
||||
<id>7adb9f1a-73d7-0391-1238-d2e2c300ddaa</id>
|
||||
<content type="text">All entries with category of 'wiktionary'.</content>
|
||||
</entry>
|
||||
</feed>
|
||||
)";
|
||||
|
||||
TEST(OpdsParsingTest, languageTest)
|
||||
{
|
||||
FeedLanguages expectedLanguagesFromFeed = {{"abk", "Abkhazian"}, {"zul", "isiZulu"}};
|
||||
EXPECT_EQ(kiwix::readLanguagesFromFeed(sampleLanguageOpdsStream), expectedLanguagesFromFeed);
|
||||
}
|
||||
|
||||
TEST(OpdsParsingTest, categoryTest)
|
||||
{
|
||||
FeedCategories expectedCategoriesFromFeed = {"gutenberg", "iFixit", "wikivoyage", "wiktionary"};
|
||||
EXPECT_EQ(kiwix::readCategoriesFromFeed(sampleCategoriesOpdsStream), expectedCategoriesFromFeed);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue