mirror of https://github.com/kiwix/libkiwix.git
603 lines
13 KiB
C++
603 lines
13 KiB
C++
/*
|
|
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 3 of the License, or
|
|
* any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
* MA 02110-1301, USA.
|
|
*/
|
|
|
|
#include "reader.h"
|
|
#include <time.h>
|
|
|
|
#include <zim/search.h>
|
|
#include <zim/item.h>
|
|
#include <zim/error.h>
|
|
|
|
#include "tools/otherTools.h"
|
|
|
|
/* Return the lowercase hexadecimal digit for the high nibble (bits 4-7)
 * of the byte `v`. */
inline char hi(char v)
{
  static const char kHexDigits[] = "0123456789abcdef";
  /* Go through unsigned char first: right-shifting a negative value is
   * implementation-defined before C++20. */
  const unsigned char byte = static_cast<unsigned char>(v);
  return kHexDigits[(byte >> 4) & 0xf];
}
|
|
|
|
/* Return the lowercase hexadecimal digit for the low nibble (bits 0-3)
 * of the byte `v`. */
inline char lo(char v)
{
  /* Shared read-only table instead of a fresh mutable copy on every call. */
  static const char kHexDigits[] = "0123456789abcdef";
  const unsigned char byte = static_cast<unsigned char>(v);
  return kHexDigits[byte & 0xf];
}
|
|
|
|
/* Format the first 16 bytes of `in` as a lowercase hexadecimal UUID string
 * laid out as 8-4-4-4-12 digits separated by '-'.
 *
 * Input shorter than 16 bytes is padded with zero bytes (the previous code
 * indexed past the end of the string); bytes beyond the 16th are ignored. */
std::string hexUUID(std::string in)
{
  static const char kHexDigits[] = "0123456789abcdef";
  const std::string::size_type kUuidSize = 16;

  /* Normalise the length so every index below is in range. */
  in.resize(kUuidSize, '\0');

  std::string out;
  out.reserve(2 * kUuidSize + 4);
  for (std::string::size_type n = 0; n < kUuidSize; ++n) {
    /* Group boundaries of the 8-4-4-4-12 layout, expressed in bytes. */
    if (n == 4 || n == 6 || n == 8 || n == 10) {
      out += '-';
    }
    const unsigned char byte = static_cast<unsigned char>(in[n]);
    out += kHexDigits[(byte >> 4) & 0xf];
    out += kHexDigits[byte & 0xf];
  }
  return out;
}
|
|
|
|
namespace kiwix
|
|
{
|
|
/* Constructor */
|
|
Reader::Reader(const string zimFilePath)
|
|
: zimArchive(nullptr),
|
|
zimFilePath(zimFilePath)
|
|
{
|
|
string tmpZimFilePath = zimFilePath;
|
|
|
|
/* Remove potential trailing zimaa */
|
|
size_t found = tmpZimFilePath.rfind("zimaa");
|
|
if (found != string::npos && tmpZimFilePath.size() > 5
|
|
&& found == tmpZimFilePath.size() - 5) {
|
|
tmpZimFilePath.resize(tmpZimFilePath.size() - 2);
|
|
}
|
|
|
|
zimArchive.reset(new zim::Archive(tmpZimFilePath));
|
|
|
|
/* initialize random seed: */
|
|
srand(time(nullptr));
|
|
}
|
|
|
|
/* Give non-owning access to the underlying zim::Archive; ownership stays
 * with the `zimArchive` member. */
zim::Archive* Reader::getZimArchive() const
{
  zim::Archive* const archive = zimArchive.get();
  return archive;
}
|
|
|
|
/* Read the "Counter" metadata of the archive and parse it with
 * parseMimetypeCounter().
 *
 * An archive without "Counter" metadata yields an empty counter map instead
 * of letting zim::EntryNotFound escape from every count getter. */
MimeCounterType Reader::parseCounterMetadata() const
{
  try {
    const std::string counterContent = zimArchive->getMetadata("Counter");
    return parseMimetypeCounter(counterContent);
  } catch (const zim::EntryNotFound&) {
    return MimeCounterType();
  }
}
|
|
|
|
/* Get the count of articles which can be indexed/displayed */
|
|
unsigned int Reader::getArticleCount() const
|
|
{
|
|
std::map<const std::string, unsigned int> counterMap
|
|
= this->parseCounterMetadata();
|
|
unsigned int counter = 0;
|
|
|
|
for(auto &pair:counterMap) {
|
|
if (startsWith(pair.first, "text/html")) {
|
|
counter += pair.second;
|
|
}
|
|
}
|
|
|
|
return counter;
|
|
}
|
|
|
|
/* Get the count of medias content in the ZIM file */
|
|
unsigned int Reader::getMediaCount() const
|
|
{
|
|
std::map<const std::string, unsigned int> counterMap
|
|
= this->parseCounterMetadata();
|
|
unsigned int counter = 0;
|
|
|
|
auto it = counterMap.find("image/jpeg");
|
|
if (it != counterMap.end()) {
|
|
counter += it->second;
|
|
}
|
|
|
|
it = counterMap.find("image/gif");
|
|
if (it != counterMap.end()) {
|
|
counter += it->second;
|
|
}
|
|
|
|
it = counterMap.find("image/png");
|
|
if (it != counterMap.end()) {
|
|
counter += it->second;
|
|
}
|
|
return counter;
|
|
}
|
|
|
|
/* Get the total of all items of a ZIM file, redirects included */
|
|
unsigned int Reader::getGlobalCount() const
|
|
{
|
|
return zimArchive->getEntryCount();
|
|
}
|
|
|
|
/* Return the UID of the ZIM file */
|
|
string Reader::getId() const
|
|
{
|
|
std::ostringstream s;
|
|
s << zimArchive->getUuid();
|
|
return s.str();
|
|
}
|
|
|
|
/* Pick a random "text/html" entry that is not the main page.
 *
 * A random entry index is drawn with rand() up to 41 times; the first entry
 * whose path differs from the main page path and whose item (redirects
 * followed) has the exact mimetype "text/html" is returned.
 * Throws NoEntry when no attempt succeeds. */
Entry Reader::getRandomPage() const
{
  const auto mainPagePath = zimArchive->getMainEntry().getPath();

  /* Bounded number of draws so an archive without HTML cannot loop forever. */
  const int kMaxAttempts = 41;
  for (int attempt = 0; attempt < kMaxAttempts; ++attempt) {
    /* Scale rand() into [0, 1), then into [0, entryCount). */
    const double unitRandom
      = static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1);
    const auto idx
      = static_cast<zim::size_type>(unitRandom * zimArchive->getEntryCount());
    auto entry = zimArchive->getEntryByPath(idx);

    if (entry.getPath() != mainPagePath
        && entry.getItem(true).getMimetype() == "text/html") {
      return entry;
    }
  }
  throw NoEntry();
}
|
|
|
|
/* Return the archive's main entry wrapped as an Entry. */
Entry Reader::getMainPage() const
{
  Entry mainPage = zimArchive->getMainEntry();
  return mainPage;
}
|
|
|
|
/* Fetch the archive's favicon.
 *
 * On success `content` receives the item data and `mimeType` its mimetype,
 * and true is returned. When the archive reports zim::EntryNotFound, false
 * is returned. */
bool Reader::getFavicon(string& content, string& mimeType) const
{
  try {
    auto faviconEntry = zimArchive->getFaviconEntry();
    auto faviconItem = faviconEntry.getItem(true);  /* follow redirects */
    content = faviconItem.getData();
    mimeType = faviconItem.getMimetype();
  } catch (const zim::EntryNotFound&) {
    return false;
  }
  return true;
}
|
|
|
|
/* Return the path this reader was constructed with. */
string Reader::getZimFilePath() const
{
  return this->zimFilePath;
}
|
|
/* Return a metatag value */
|
|
bool Reader::getMetadata(const string& name, string& value) const
|
|
{
|
|
try {
|
|
value = zimArchive->getMetadata(name);
|
|
return true;
|
|
} catch(zim::EntryNotFound& e) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* Complete body of a simple metadata getter: declares a local string, fills
 * it through getMetadata(NAME, ...) (result ignored) and returns it. */
#define METADATA(NAME) std::string v; getMetadata(NAME, v); return v;
|
|
|
|
/* Return the "Name" metadata, or an empty string when it is not found. */
string Reader::getName() const
{
  std::string name;
  getMetadata("Name", name);
  return name;
}
|
|
|
|
/* Return the title of the ZIM file.
 *
 * The "Title" metadata is used when present and non-empty. Otherwise the
 * title is derived from the last element of the file path: underscores
 * become spaces and everything from the first ".zim" onwards is dropped.
 *
 * The lookup goes through the non-throwing getMetadata() wrapper so that the
 * fallback is also reached when the metadata is absent (the direct archive
 * call throws zim::EntryNotFound in that case). */
string Reader::getTitle() const
{
  string value;
  this->getMetadata("Title", value);
  if (value.empty()) {
    value = getLastPathElement(zimFilePath);
    std::replace(value.begin(), value.end(), '_', ' ');
    /* find() yields npos when ".zim" is absent; substr(0, npos) then keeps
     * the whole string. */
    const size_t pos = value.find(".zim");
    value = value.substr(0, pos);
  }
  return value;
}
|
|
|
|
/* Return the "Creator" metadata, or an empty string when it is not found. */
string Reader::getCreator() const
{
  std::string creator;
  getMetadata("Creator", creator);
  return creator;
}
|
|
|
|
/* Return the "Publisher" metadata, or an empty string when it is not found. */
string Reader::getPublisher() const
{
  std::string publisher;
  getMetadata("Publisher", publisher);
  return publisher;
}
|
|
|
|
/* Return the "Date" metadata, or an empty string when it is not found. */
string Reader::getDate() const
{
  std::string date;
  getMetadata("Date", date);
  return date;
}
|
|
|
|
/* Return the "Description" metadata, falling back to "Subtitle" when the
 * description is missing or empty (Mediawiki Collection tends to use the
 * "Subtitle" name). Returns an empty string when neither is available. */
string Reader::getDescription() const
{
  string description;
  getMetadata("Description", description);
  if (!description.empty()) {
    return description;
  }

  getMetadata("Subtitle", description);
  return description;
}
|
|
|
|
/* Return the "LongDescription" metadata, or an empty string when it is not
 * found. */
string Reader::getLongDescription() const
{
  std::string longDescription;
  getMetadata("LongDescription", longDescription);
  return longDescription;
}
|
|
|
|
/* Return the "Language" metadata, or an empty string when it is not found. */
string Reader::getLanguage() const
{
  std::string language;
  getMetadata("Language", language);
  return language;
}
|
|
|
|
/* Return the "License" metadata, or an empty string when it is not found. */
string Reader::getLicense() const
{
  std::string license;
  getMetadata("License", license);
  return license;
}
|
|
|
|
/* Return the "Tags" metadata.
 *
 * With `original` set, the raw metadata string is returned unchanged.
 * Otherwise the string is passed through convertTags() and the resulting
 * tags are joined with ";". */
string Reader::getTags(bool original) const
{
  string rawTags;
  getMetadata("Tags", rawTags);

  if (!original) {
    const auto convertedTags = convertTags(rawTags);
    return join(convertedTags, ";");
  }
  return rawTags;
}
|
|
|
|
|
|
/* Return the value associated with `tagName` in the converted "Tags"
 * metadata, as computed by getTagValueFromTagList(). */
string Reader::getTagStr(const std::string& tagName) const
{
  string rawTags;
  getMetadata("Tags", rawTags);

  const auto tagList = convertTags(rawTags);
  return getTagValueFromTagList(tagList, tagName);
}
|
|
|
|
bool Reader::getTagBool(const std::string& tagName) const
|
|
{
|
|
return convertStrToBool(getTagStr(tagName));
|
|
}
|
|
|
|
/* Return the "Relation" metadata, or an empty string when it is not found. */
string Reader::getRelation() const
{
  std::string relation;
  getMetadata("Relation", relation);
  return relation;
}
|
|
|
|
/* Return the "Flavour" metadata, or an empty string when it is not found. */
string Reader::getFlavour() const
{
  std::string flavour;
  getMetadata("Flavour", flavour);
  return flavour;
}
|
|
|
|
/* Return the "Source" metadata, or an empty string when it is not found. */
string Reader::getSource() const
{
  std::string source;
  getMetadata("Source", source);
  return source;
}
|
|
|
|
/* Return the "Scraper" metadata, or an empty string when it is not found. */
string Reader::getScraper() const
{
  std::string scraper;
  getMetadata("Scraper", scraper);
  return scraper;
}
|
|
/* The getter helper macro is not needed past this point. */
#undef METADATA
|
|
|
|
/* Return the original id of the ZIM file, built from the "startfileuid"
 * metadata, formatted by hexUUID(). Returns an empty string when that
 * metadata is missing or empty.
 *
 * The metadata is read as a list of decimal numbers, each terminated by a
 * newline; every number becomes one byte of the 16-byte UUID.
 * NOTE(review): text after the last newline is ignored, as before — confirm
 * that the metadata always ends with a newline.
 *
 * Compared to the previous version, at most 16 bytes are stored (extra lines
 * used to overflow the buffer) and the buffer is handed to hexUUID() with an
 * explicit length, so zero bytes no longer truncate it and a missing NUL
 * terminator is no longer read past. */
string Reader::getOrigId() const
{
  string value;
  this->getMetadata("startfileuid", value);
  if (value.empty()) {
    return "";
  }

  const size_t kUuidSize = 16;
  char uuidBytes[kUuidSize] = {};
  size_t filled = 0;
  std::string number;
  for (const char c : value) {
    if (c != '\n') {
      number += c;
      continue;
    }
    if (filled == kUuidSize) {
      /* UUID complete: ignore any further lines. */
      break;
    }
    uuidBytes[filled] = static_cast<char>(atoi(number.c_str()));
    ++filled;
    number.clear();
  }

  return hexUUID(std::string(uuidBytes, kUuidSize));
}
|
|
|
|
/* Return the entry stored at `path`.
 *
 * An empty path or "/" resolves to the main page.
 * Throws NoEntry when there is no archive or when the archive reports
 * zim::EntryNotFound for the path. */
Entry Reader::getEntryFromPath(const std::string& path) const
{
  if (!zimArchive) {
    throw NoEntry();
  }

  const bool wantsMainPage = path.empty() || path == "/";
  if (wantsMainPage) {
    return getMainPage();
  }

  try {
    return zimArchive->getEntryByPath(path);
  } catch (const zim::EntryNotFound&) {
    throw NoEntry();
  }
}
|
|
|
|
/* Same as getEntryFromPath(), for a URL-encoded `path`: the path is decoded
 * with urlDecode(path, true) before the lookup. */
Entry Reader::getEntryFromEncodedPath(const std::string& path) const
{
  const std::string decodedPath = urlDecode(path, true);
  return getEntryFromPath(decodedPath);
}
|
|
|
|
/* Return the entry whose title is `title`.
 *
 * Throws NoEntry when there is no archive or when the archive reports
 * zim::EntryNotFound for the title. */
Entry Reader::getEntryFromTitle(const std::string& title) const
{
  if (!zimArchive) {
    throw NoEntry();
  }

  try {
    return zimArchive->getEntryByTitle(title);
  } catch (const zim::EntryNotFound&) {
    throw NoEntry();
  }
}
|
|
|
|
bool Reader::pathExists(const string& path) const
|
|
{
|
|
if (!zimArchive)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
return zimArchive->hasEntryByPath(path);
|
|
}
|
|
|
|
/* Does the ZIM file has a fulltext index */
|
|
bool Reader::hasFulltextIndex() const
|
|
{
|
|
if (!zimArchive)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
return zimArchive->hasFulltextIndex();
|
|
}
|
|
|
|
/* Search titles by prefix */
|
|
|
|
bool Reader::searchSuggestions(const string& prefix,
|
|
unsigned int suggestionsCount,
|
|
const bool reset)
|
|
{
|
|
/* Reset the suggestions otherwise check if the suggestions number is less
|
|
* than the suggestionsCount */
|
|
if (reset) {
|
|
this->suggestions.clear();
|
|
this->suggestionsOffset = this->suggestions.begin();
|
|
} else {
|
|
if (this->suggestions.size() > suggestionsCount) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
auto ret = searchSuggestions(prefix, suggestionsCount, this->suggestions);
|
|
|
|
/* Set the cursor to the begining */
|
|
this->suggestionsOffset = this->suggestions.begin();
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
bool Reader::searchSuggestions(const string& prefix,
|
|
unsigned int suggestionsCount,
|
|
SuggestionsList_t& results)
|
|
{
|
|
bool retVal = false;
|
|
|
|
/* Return if no prefix */
|
|
if (prefix.size() == 0) {
|
|
return false;
|
|
}
|
|
|
|
for (auto& entry: zimArchive->findByTitle(prefix)) {
|
|
if (results.size() >= suggestionsCount) {
|
|
break;
|
|
}
|
|
/* Extract the interesting part of article title & url */
|
|
std::string normalizedArticleTitle
|
|
= kiwix::normalize(entry.getTitle());
|
|
|
|
// Get the final path.
|
|
auto item = entry.getItem(true);
|
|
std::string articleFinalUrl = item.getPath();
|
|
|
|
/* Go through all already found suggestions and skip if this
|
|
article is already in the suggestions list (with an other
|
|
title) */
|
|
bool insert = true;
|
|
std::vector<std::vector<std::string>>::iterator suggestionItr;
|
|
for (suggestionItr = results.begin();
|
|
suggestionItr != results.end();
|
|
suggestionItr++) {
|
|
int result = normalizedArticleTitle.compare((*suggestionItr)[2]);
|
|
if (result == 0 && articleFinalUrl.compare((*suggestionItr)[1]) == 0) {
|
|
insert = false;
|
|
break;
|
|
} else if (result < 0) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Insert if possible */
|
|
if (insert) {
|
|
std::vector<std::string> suggestion;
|
|
suggestion.push_back(entry.getTitle());
|
|
suggestion.push_back(articleFinalUrl);
|
|
suggestion.push_back(normalizedArticleTitle);
|
|
results.insert(suggestionItr, suggestion);
|
|
}
|
|
|
|
/* Suggestions where found */
|
|
retVal = true;
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
std::vector<std::string> Reader::getTitleVariants(
|
|
const std::string& title) const
|
|
{
|
|
std::vector<std::string> variants;
|
|
variants.push_back(title);
|
|
variants.push_back(kiwix::ucFirst(title));
|
|
variants.push_back(kiwix::lcFirst(title));
|
|
variants.push_back(kiwix::toTitle(title));
|
|
return variants;
|
|
}
|
|
|
|
|
|
bool Reader::searchSuggestionsSmart(const string& prefix,
|
|
unsigned int suggestionsCount)
|
|
{
|
|
this->suggestions.clear();
|
|
this->suggestionsOffset = this->suggestions.begin();
|
|
|
|
auto ret = searchSuggestionsSmart(prefix, suggestionsCount, this->suggestions);
|
|
|
|
this->suggestionsOffset = this->suggestions.begin();
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* Try also a few variations of the prefix to have better results */
|
|
bool Reader::searchSuggestionsSmart(const string& prefix,
|
|
unsigned int suggestionsCount,
|
|
SuggestionsList_t& results)
|
|
{
|
|
std::vector<std::string> variants = this->getTitleVariants(prefix);
|
|
bool retVal = false;
|
|
|
|
/* Try to search in the title using fulltext search database */
|
|
auto suggestionSearch = zim::Search(*zimArchive);
|
|
suggestionSearch.set_query(prefix);
|
|
suggestionSearch.set_range(0, suggestionsCount);
|
|
suggestionSearch.set_suggestion_mode(true);
|
|
|
|
if (suggestionSearch.get_matches_estimated()) {
|
|
for (auto current = suggestionSearch.begin();
|
|
current != suggestionSearch.end();
|
|
current++) {
|
|
std::vector<std::string> suggestion;
|
|
suggestion.push_back(current->getTitle());
|
|
suggestion.push_back(current->getPath());
|
|
suggestion.push_back(kiwix::normalize(current->getTitle()));
|
|
results.push_back(suggestion);
|
|
}
|
|
retVal = true;
|
|
} else {
|
|
for (std::vector<std::string>::iterator variantsItr = variants.begin();
|
|
variantsItr != variants.end();
|
|
variantsItr++) {
|
|
retVal = this->searchSuggestions(*variantsItr, suggestionsCount, results)
|
|
|| retVal;
|
|
}
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/* Get next suggestion */
|
|
bool Reader::getNextSuggestion(string& title)
|
|
{
|
|
if (this->suggestionsOffset != this->suggestions.end()) {
|
|
/* title */
|
|
title = (*(this->suggestionsOffset))[0];
|
|
|
|
/* increment the cursor for the next call */
|
|
this->suggestionsOffset++;
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/* Get the next suggestion together with its URL.
 *
 * Stores the title and path of the suggestion under the cursor in `title`
 * and `url`, moves the cursor forward and returns true. Returns false,
 * leaving both outputs untouched, once the cursor is at the end of the
 * suggestion list. */
bool Reader::getNextSuggestion(string& title, string& url)
{
  if (suggestionsOffset == suggestions.end()) {
    return false;
  }

  const auto& current = *suggestionsOffset;
  title = current[0];
  url = current[1];

  /* Advance the cursor for the next call. */
  ++suggestionsOffset;
  return true;
}
|
|
|
|
/* Check if the file has as checksum */
|
|
bool Reader::canCheckIntegrity() const
|
|
{
|
|
return zimArchive->hasChecksum();
|
|
}
|
|
|
|
/* Return true if corrupted, false otherwise */
|
|
bool Reader::isCorrupted() const
|
|
{
|
|
try {
|
|
if (zimArchive->check() == true) {
|
|
return false;
|
|
}
|
|
} catch (exception& e) {
|
|
cerr << e.what() << endl;
|
|
return true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Return the file size, works also for splitted files */
|
|
unsigned int Reader::getFileSize() const
|
|
{
|
|
if (!zimArchive) {
|
|
return 0;
|
|
}
|
|
return zimArchive->getFilesize() / 1024;
|
|
}
|
|
|
|
}
|