mirror of https://github.com/kiwix/libkiwix.git
680 lines
19 KiB
C++
680 lines
19 KiB
C++
/*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
|
|
|
|
#include "reader.h"
|
|
|
|
/* Return the lowercase hexadecimal digit ('0'-'9', 'a'-'f') that
   represents the upper four bits of the byte v. */
inline char hi(char v) {
  static const char digits[] = "0123456789abcdef";
  /* Work on the unsigned byte so the shift never drags sign bits in */
  const unsigned char byte = static_cast<unsigned char>(v);
  return digits[byte >> 4];
}
|
|
|
|
/* Return the lowercase hexadecimal digit ('0'-'9', 'a'-'f') that
   represents the lower four bits of the byte v. */
inline char lo(char v) {
  static const char digits[] = "0123456789abcdef";
  return digits[static_cast<unsigned char>(v) % 16];
}
|
|
|
|
std::string hexUUID (std::string in) {
|
|
std::ostringstream out;
|
|
for (unsigned n = 0; n < 4; ++n)
|
|
out << hi(in[n]) << lo(in[n]);
|
|
out << '-';
|
|
for (unsigned n = 4; n < 6; ++n)
|
|
out << hi(in[n]) << lo(in[n]);
|
|
out << '-';
|
|
for (unsigned n = 6; n < 8; ++n)
|
|
out << hi(in[n]) << lo(in[n]);
|
|
out << '-';
|
|
for (unsigned n = 8; n < 10; ++n)
|
|
out << hi(in[n]) << lo(in[n]);
|
|
out << '-';
|
|
for (unsigned n = 10; n < 16; ++n)
|
|
out << hi(in[n]) << lo(in[n]);
|
|
std::string op=out.str();
|
|
return op;
|
|
}
|
|
|
|
namespace kiwix {
|
|
|
|
/* Constructor */
|
|
/* Open the ZIM file at zimFilePath and cache the offsets and entry
   counts of the 'A' and 'I' namespaces.

   A path that ends in "zimaa" (and is longer than those five
   characters) has its last two characters removed before opening, so
   that it ends in "zim". Also seeds rand() from the current time. */
Reader::Reader(const string zimFilePath)
  : zimFileHandler(NULL) {
  string path = zimFilePath;

  /* Remove potential trailing "aa" of a "zimaa" suffix */
  const size_t suffixPos = path.rfind("zimaa");
  const bool endsWithZimaa = suffixPos != string::npos &&
                             path.size() > 5 &&
                             suffixPos == path.size() - 5;
  if (endsWithZimaa) {
    path.erase(path.size() - 2);
  }

  zim::File *handler = new zim::File(path);
  this->zimFileHandler = handler;

  if (handler != NULL) {
    this->firstArticleOffset = handler->getNamespaceBeginOffset('A');
    this->lastArticleOffset = handler->getNamespaceEndOffset('A');
    this->currentArticleOffset = this->firstArticleOffset;
    this->nsACount = handler->getNamespaceCount('A');
    this->nsICount = handler->getNamespaceCount('I');
  }

  /* Initialize the random seed used by getRandomPageUrl() */
  srand(time(NULL));
}
|
|
|
|
/* Destructor */
|
|
/* Release the ZIM file handler created by the constructor. */
Reader::~Reader() {
  /* delete on a null pointer is a no-op, so no explicit test is needed */
  delete this->zimFileHandler;
}
|
|
|
|
/* Return the zim::File pointer held by this reader. The reader's
   destructor deletes it, so callers must not. */
zim::File* Reader::getZimFileHandler() {
  zim::File *handler = this->zimFileHandler;
  return handler;
}
|
|
|
|
/* Reset the cursor for GetNextArticle() */
|
|
void Reader::reset() {
|
|
this->currentArticleOffset = this->firstArticleOffset;
|
|
}
|
|
|
|
std::map<std::string, unsigned int> Reader::parseCounterMetadata() {
|
|
std::map<std::string, unsigned int> counters;
|
|
string content, mimeType, item, counterString;
|
|
unsigned int contentLength, counter;
|
|
string counterUrl = "/M/Counter";
|
|
|
|
this->getContentByUrl(counterUrl, content, contentLength, mimeType);
|
|
stringstream ssContent(content);
|
|
|
|
while(getline(ssContent, item, ';')) {
|
|
stringstream ssItem(item);
|
|
getline(ssItem, mimeType, '=');
|
|
getline(ssItem, counterString, '=');
|
|
if (!counterString.empty() && !mimeType.empty()) {
|
|
sscanf(counterString.c_str(), "%u", &counter);
|
|
counters.insert(pair<string, int>(mimeType, counter));
|
|
}
|
|
}
|
|
|
|
return counters;
|
|
}
|
|
|
|
/* Get the count of articles which can be indexed/displayed */
|
|
unsigned int Reader::getArticleCount() {
|
|
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata();
|
|
unsigned int counter = 0;
|
|
|
|
if (counterMap.empty()) {
|
|
counter = this->nsACount;
|
|
} else {
|
|
std::map<std::string, unsigned int>::const_iterator it = counterMap.find("text/html");
|
|
if (it != counterMap.end())
|
|
counter = it->second;
|
|
}
|
|
|
|
return counter;
|
|
}
|
|
|
|
/* Get the count of medias content in the ZIM file */
|
|
unsigned int Reader::getMediaCount() {
|
|
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata();
|
|
unsigned int counter = 0;
|
|
|
|
if (counterMap.empty())
|
|
counter = this->nsICount;
|
|
else {
|
|
std::map<std::string, unsigned int>::const_iterator it;
|
|
|
|
it = counterMap.find("image/jpeg");
|
|
if (it != counterMap.end())
|
|
counter += it->second;
|
|
|
|
it = counterMap.find("image/gif");
|
|
if (it != counterMap.end())
|
|
counter += it->second;
|
|
|
|
it = counterMap.find("image/png");
|
|
if (it != counterMap.end())
|
|
counter += it->second;
|
|
}
|
|
|
|
return counter;
|
|
}
|
|
|
|
/* Get the total of all items of a ZIM file, redirects included */
|
|
unsigned int Reader::getGlobalCount() {
|
|
return this->zimFileHandler->getCountArticles();
|
|
}
|
|
|
|
/* Return the UID of the ZIM file */
|
|
/* Return the UUID found in the ZIM file header, rendered through its
   stream output operator. */
string Reader::getId() {
  std::ostringstream uuidStream;
  uuidStream << zimFileHandler->getFileheader().getUuid();
  return uuidStream.str();
}
|
|
|
|
/* Return a page url from a title */
|
|
bool Reader::getPageUrlFromTitle(const string &title, string &url) {
|
|
/* Extract the content from the zim file */
|
|
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findxByTitle('A', title);
|
|
|
|
/* Test if the article was found */
|
|
if (resultPair.first == true) {
|
|
|
|
/* Get the article */
|
|
zim::Article article = *resultPair.second;
|
|
|
|
/* If redirect */
|
|
unsigned int loopCounter = 0;
|
|
while (article.isRedirect() && loopCounter++<42) {
|
|
article = article.getRedirectArticle();
|
|
}
|
|
|
|
url = article.getLongUrl();
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/* Return the URL of a randomly chosen article */
|
|
/* Return the long URL of an article picked at random (using rand())
   among the nsACount entries starting at firstArticleOffset.

   The main page is skipped whenever another candidate exists. Returns
   an empty string when namespace 'A' holds no entry. */
string Reader::getRandomPageUrl() {
  /* Nothing to pick from */
  if (this->nsACount == 0) {
    return "";
  }

  const std::string mainPageUrl = this->getMainPageUrl();
  std::string url;

  do {
    /* Scale rand() into [0, nsACount) */
    const double unit = static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1);
    const zim::size_type idx = this->firstArticleOffset +
      static_cast<zim::size_type>(unit * this->nsACount);
    zim::Article article = zimFileHandler->getArticle(idx);
    url = article.getLongUrl();
    /* Re-draw only if something other than the main page can be drawn;
       otherwise a file whose single article is the main page would loop
       forever. */
  } while (this->nsACount > 1 && url == mainPageUrl);

  return url;
}
|
|
|
|
/* Return the welcome page URL */
|
|
/* Return the long URL of the main page declared in the ZIM file
   header. When the header declares no main page, or the declared one
   yields an empty URL, return getFirstPageUrl() instead. */
string Reader::getMainPageUrl() {
  if (!this->zimFileHandler->getFileheader().hasMainPage()) {
    return getFirstPageUrl();
  }

  zim::Article mainPage = zimFileHandler->getArticle(this->zimFileHandler->getFileheader().getMainPage());
  string mainUrl = mainPage.getLongUrl();

  if (mainUrl.empty()) {
    mainUrl = getFirstPageUrl();
  }
  return mainUrl;
}
|
|
|
|
/* Fetch the favicon into `content` / `mimeType`.

   The URLs "/-/favicon.png", "/I/favicon.png", "/I/favicon" and
   "/-/favicon" are tried in that order; the first one that yields
   non-empty content wins. Returns true when content was found. */
bool Reader::getFavicon(string &content, string &mimeType) {
  static const char *const candidateUrls[] = {
    "/-/favicon.png",
    "/I/favicon.png",
    "/I/favicon",
    "/-/favicon"
  };
  const size_t candidateCount = sizeof(candidateUrls) / sizeof(candidateUrls[0]);

  unsigned int contentLength = 0;
  for (size_t i = 0; i < candidateCount; ++i) {
    this->getContentByUrl(candidateUrls[i], content, contentLength, mimeType);
    if (!content.empty()) {
      break;
    }
  }

  return !content.empty();
}
|
|
|
|
/* Return a metatag value */
|
|
bool Reader::getMetatag(const string &name, string &value) {
|
|
unsigned int contentLength = 0;
|
|
string contentType = "";
|
|
|
|
return this->getContentByUrl( "/M/" + name, value,
|
|
contentLength, contentType);
|
|
}
|
|
|
|
/* Return the "Title" metadata. When it is empty, derive a title from
   the file name: take its last path element, turn every '_' into a
   space and cut everything from the first ".zim" onwards. */
string Reader::getTitle() {
  string title;
  this->getMetatag("Title", title);

  if (!title.empty()) {
    return title;
  }

  /* Fallback: build a readable title out of the file name */
  title = getLastPathElement(zimFileHandler->getFilename());
  std::replace(title.begin(), title.end(), '_', ' ');
  const size_t extensionPos = title.find(".zim");
  return title.substr(0, extensionPos);
}
|
|
|
|
/* Return the "Name" metadata, or an empty string when it is absent. */
string Reader::getName() {
  string name;
  getMetatag("Name", name);
  return name;
}
|
|
|
|
/* Return the "Tags" metadata, or an empty string when it is absent. */
string Reader::getTags() {
  string tags;
  getMetatag("Tags", tags);
  return tags;
}
|
|
|
|
/* Return the "Description" metadata; when it is empty, return the
   "Subtitle" metadata instead (which may be empty as well). */
string Reader::getDescription() {
  string description;
  this->getMetatag("Description", description);

  if (!description.empty()) {
    return description;
  }

  /* Mediawiki Collection tends to use the "Subtitle" name */
  this->getMetatag("Subtitle", description);
  return description;
}
|
|
|
|
/* Return the "Language" metadata, or an empty string when it is absent. */
string Reader::getLanguage() {
  string language;
  getMetatag("Language", language);
  return language;
}
|
|
|
|
/* Return the "Date" metadata, or an empty string when it is absent. */
string Reader::getDate() {
  string date;
  getMetatag("Date", date);
  return date;
}
|
|
|
|
/* Return the "Creator" metadata, or an empty string when it is absent. */
string Reader::getCreator() {
  string creator;
  getMetatag("Creator", creator);
  return creator;
}
|
|
|
|
/* Return the "Publisher" metadata, or an empty string when it is absent. */
string Reader::getPublisher() {
  string publisher;
  getMetatag("Publisher", publisher);
  return publisher;
}
|
|
|
|
/* Return the "startfileuid" metadata formatted by hexUUID(), or an
   empty string when that metadata is absent.

   The metadata is read as a list of decimal numbers, each terminated
   by '\n'; every number becomes one byte of the identifier. At most 16
   bytes are taken, missing ones stay zero, and text after the last
   '\n' is ignored. */
string Reader::getOrigId() {
  string value;
  this->getMetatag("startfileuid", value);
  if (value.empty())
    return "";

  const unsigned int uuidLength = 16;
  char bytes[uuidLength] = { 0 };
  unsigned int byteCount = 0;
  std::string number;

  /* Stop once the buffer is full: additional values used to be written
     past the end of the array. */
  for (std::string::size_type i = 0; i < value.size() && byteCount < uuidLength; ++i) {
    if (value[i] == '\n') {
      bytes[byteCount] = atoi(number.c_str());
      ++byteCount;
      number.clear();
    } else {
      number += value[i];
    }
  }

  /* Pass an explicit length: building the string from a plain char
     pointer would stop at the first zero byte (or run past the array
     when none of the 16 bytes is zero). */
  return hexUUID(std::string(bytes, uuidLength));
}
|
|
|
|
/* Return the first page URL */
|
|
/* Return the long URL of the article located at the begin offset of
   namespace 'A'. */
string Reader::getFirstPageUrl() {
  const zim::size_type firstOffset = zimFileHandler->getNamespaceBeginOffset('A');
  zim::Article firstArticle = zimFileHandler->getArticle(firstOffset);
  return firstArticle.getLongUrl();
}
|
|
|
|
bool Reader::parseUrl(const string &url, char *ns, string &title) {
|
|
/* Offset to visit the url */
|
|
unsigned int urlLength = url.size();
|
|
unsigned int offset = 0;
|
|
|
|
/* Ignore the '/' */
|
|
while ((offset < urlLength) && (url[offset] == '/')) offset++;
|
|
|
|
/* Get namespace */
|
|
while ((offset < urlLength) && (url[offset] != '/')) {
|
|
*ns= url[offset];
|
|
offset++;
|
|
}
|
|
|
|
/* Ignore the '/' */
|
|
while ((offset < urlLength) && (url[offset] == '/')) offset++;
|
|
|
|
/* Get content title */
|
|
unsigned int titleOffset = offset;
|
|
while (offset < urlLength) {
|
|
offset++;
|
|
}
|
|
|
|
/* unescape title */
|
|
title = url.substr(titleOffset, offset - titleOffset);
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Return article by url */
|
|
bool Reader::getArticleObjectByDecodedUrl(const string &url, zim::Article &article) {
|
|
bool retVal = false;
|
|
|
|
if (this->zimFileHandler != NULL) {
|
|
|
|
/* Parse the url */
|
|
char ns = 0;
|
|
string titleStr;
|
|
this->parseUrl(url, &ns, titleStr);
|
|
|
|
/* Main page */
|
|
if (titleStr.empty() && ns == 0) {
|
|
this->parseUrl(this->getMainPageUrl(), &ns, titleStr);
|
|
}
|
|
|
|
/* Extract the content from the zim file */
|
|
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findx(ns, titleStr);
|
|
|
|
/* Test if the article was found */
|
|
if (resultPair.first == true) {
|
|
article = zimFileHandler->getArticle(resultPair.second.getIndex());
|
|
retVal = true;
|
|
}
|
|
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/* Return the mimeType without the content */
|
|
bool Reader::getMimeTypeByUrl(const string &url, string &mimeType) {
|
|
bool retVal = false;
|
|
|
|
if (this->zimFileHandler != NULL) {
|
|
|
|
zim::Article article;
|
|
if (this->getArticleObjectByDecodedUrl(url, article)) {
|
|
try {
|
|
mimeType = string(article.getMimeType().data(), article.getMimeType().size());
|
|
} catch (exception &e) {
|
|
cerr << "Unable to get the mimetype for "<< url << ":" << e.what() << endl;
|
|
mimeType = "application/octet-stream";
|
|
}
|
|
retVal = true;
|
|
} else {
|
|
mimeType = "";
|
|
}
|
|
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/* Get a content from a zim file */
|
|
bool Reader::getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) {
|
|
return this->getContentByEncodedUrl(url, content, contentLength, contentType);
|
|
}
|
|
|
|
bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) {
|
|
return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, baseUrl);
|
|
}
|
|
|
|
bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) {
|
|
std::string stubRedirectUrl;
|
|
return this->getContentByEncodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl);
|
|
}
|
|
|
|
bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) {
|
|
std::string stubRedirectUrl;
|
|
return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl);
|
|
}
|
|
|
|
bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) {
|
|
bool retVal = false;
|
|
content="";
|
|
contentType="";
|
|
contentLength = 0;
|
|
if (this->zimFileHandler != NULL) {
|
|
|
|
zim::Article article;
|
|
if (this->getArticleObjectByDecodedUrl(url, article)) {
|
|
|
|
/* If redirect */
|
|
unsigned int loopCounter = 0;
|
|
while (article.isRedirect() && loopCounter++<42) {
|
|
article = article.getRedirectArticle();
|
|
}
|
|
|
|
if (loopCounter < 42) {
|
|
/* Compute base url (might be different from the url if redirects */
|
|
baseUrl = "/" + std::string(1, article.getNamespace()) + "/" + article.getUrl();
|
|
|
|
/* Get the content mime-type */
|
|
try {
|
|
contentType = string(article.getMimeType().data(), article.getMimeType().size());
|
|
} catch (exception &e) {
|
|
cerr << "Unable to get the mimetype for "<< baseUrl<< ":" << e.what() << endl;
|
|
contentType = "application/octet-stream";
|
|
}
|
|
|
|
/* Get the data */
|
|
content = string(article.getData().data(), article.getArticleSize());
|
|
}
|
|
|
|
/* Try to set a stub HTML header/footer if necesssary */
|
|
if (contentType.find("text/html") != string::npos &&
|
|
content.find("<body") == std::string::npos &&
|
|
content.find("<BODY") == std::string::npos) {
|
|
content = "<html><head><title>" + article.getTitle() + "</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body>" + content + "</body></html>";
|
|
}
|
|
|
|
/* Get the data length */
|
|
contentLength = article.getArticleSize();
|
|
|
|
/* Set return value */
|
|
retVal = true;
|
|
}
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/* Check if an article exists */
|
|
bool Reader::urlExists(const string &url) {
|
|
char ns = 0;
|
|
string titleStr;
|
|
this->parseUrl(url, &ns, titleStr);
|
|
zim::File::const_iterator findItr = zimFileHandler->find(ns, titleStr);
|
|
return findItr->getUrl() == titleStr;
|
|
}
|
|
|
|
/* Search titles by prefix */
|
|
bool Reader::searchSuggestions(const string &prefix, unsigned int suggestionsCount, const bool reset) {
|
|
bool retVal = false;
|
|
zim::File::const_iterator articleItr;
|
|
|
|
/* Reset the suggestions otherwise check if the suggestions number is less than the suggestionsCount */
|
|
if (reset) {
|
|
this->suggestions.clear();
|
|
this->suggestionsOffset = this->suggestions.begin();
|
|
} else {
|
|
if (this->suggestions.size() > suggestionsCount) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* Return if no prefix */
|
|
if (prefix.size() == 0) {
|
|
return false;
|
|
}
|
|
|
|
for (articleItr = zimFileHandler->findByTitle('A', prefix);
|
|
articleItr != zimFileHandler->end() &&
|
|
articleItr->getTitle().compare(0, prefix.size(), prefix) == 0 &&
|
|
this->suggestions.size() < suggestionsCount ;
|
|
++articleItr) {
|
|
|
|
/* Extract the interesting part of article title & url */
|
|
std::string normalizedArticleTitle = kiwix::normalize(articleItr->getTitle());
|
|
std::string articleFinalUrl = "/A/"+articleItr->getUrl();
|
|
if (articleItr->isRedirect()) {
|
|
zim::Article article = *articleItr;
|
|
unsigned int loopCounter = 0;
|
|
while (article.isRedirect() && loopCounter++<42) {
|
|
article = article.getRedirectArticle();
|
|
}
|
|
articleFinalUrl = "/A/"+article.getUrl();
|
|
}
|
|
|
|
/* Go through all already found suggestions and skip if this
|
|
article is already in the suggestions list (with an other
|
|
title) */
|
|
bool insert = true;
|
|
std::vector< std::vector<std::string> >::iterator suggestionItr;
|
|
for (suggestionItr = this->suggestions.begin(); suggestionItr != this->suggestions.end(); suggestionItr++) {
|
|
int result = normalizedArticleTitle.compare((*suggestionItr)[2]);
|
|
if (result == 0 && articleFinalUrl.compare((*suggestionItr)[1]) == 0) {
|
|
insert = false;
|
|
break;
|
|
} else if (result < 0) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Insert if possible */
|
|
if (insert) {
|
|
std::vector<std::string> suggestion;
|
|
suggestion.push_back(articleItr->getTitle());
|
|
suggestion.push_back(articleFinalUrl);
|
|
suggestion.push_back(normalizedArticleTitle);
|
|
this->suggestions.insert(suggestionItr, suggestion);
|
|
}
|
|
|
|
/* Suggestions where found */
|
|
retVal = true;
|
|
}
|
|
|
|
/* Set the cursor to the begining */
|
|
this->suggestionsOffset = this->suggestions.begin();
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/* Return four variants of `title`, in this order: the title itself,
   then the results of kiwix::ucFirst(), kiwix::lcFirst() and
   kiwix::toTitle() applied to it. */
std::vector<std::string> Reader::getTitleVariants(const std::string &title) {
  std::vector<std::string> result;
  result.reserve(4);

  result.push_back(title);
  result.push_back(kiwix::ucFirst(title));
  result.push_back(kiwix::lcFirst(title));
  result.push_back(kiwix::toTitle(title));

  return result;
}
|
|
|
|
/* Try also a few variations of the prefix to have better results */
|
|
/* Search suggestions for `prefix` and for the variants produced by
   getTitleVariants(), accumulating all results in one suggestion list.

   The list is cleared first; every variant is then searched without
   resetting. Returns true if at least one of the searches reported a
   match. */
bool Reader::searchSuggestionsSmart(const string &prefix, unsigned int suggestionsCount) {
  std::vector<std::string> variants = this->getTitleVariants(prefix);

  /* Must start out false: it is OR-ed with every search result below
     (it used to be read uninitialised). */
  bool retVal = false;

  this->suggestions.clear();
  this->suggestionsOffset = this->suggestions.begin();

  for (std::vector<std::string>::iterator variantsItr = variants.begin();
       variantsItr != variants.end();
       ++variantsItr) {
    /* Call first, then OR: every variant has to be searched even after
       a previous one already matched. */
    retVal = this->searchSuggestions(*variantsItr, suggestionsCount, false) || retVal;
  }

  return retVal;
}
|
|
|
|
/* Get next suggestion */
|
|
/* Store the title of the suggestion under the cursor in `title` and
   advance the cursor. Returns false, leaving `title` untouched, once
   the end of the suggestion list has been reached. */
bool Reader::getNextSuggestion(string &title) {
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* Element 0 of a suggestion is its title */
  const std::vector<std::string> &current = *(this->suggestionsOffset);
  title = current[0];

  /* Move on for the next call */
  this->suggestionsOffset++;
  return true;
}
|
|
|
|
/* Store title and URL of the suggestion under the cursor in `title` and
   `url` and advance the cursor. Returns false, leaving both untouched,
   once the end of the suggestion list has been reached. */
bool Reader::getNextSuggestion(string &title, string &url) {
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* Element 0 of a suggestion is its title, element 1 its URL */
  const std::vector<std::string> &current = *(this->suggestionsOffset);
  title = current[0];
  url = current[1];

  /* Move on for the next call */
  this->suggestionsOffset++;
  return true;
}
|
|
|
|
/* Check if the file has a checksum */
|
|
bool Reader::canCheckIntegrity() {
|
|
return this->zimFileHandler->getChecksum() != "";
|
|
}
|
|
|
|
/* Return true if corrupted, false otherwise */
|
|
bool Reader::isCorrupted() {
|
|
try {
|
|
if (this->zimFileHandler->verify() == true)
|
|
return false;
|
|
} catch (exception &e) {
|
|
cerr << e.what() << endl;
|
|
return true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Return the file size divided by 1024 (i.e. in KiB); also works for split files */
|
|
unsigned int Reader::getFileSize() {
|
|
zim::File *file = this->getZimFileHandler();
|
|
zim::offset_type size = 0;
|
|
|
|
if (file != NULL) {
|
|
size = file->getFilesize();
|
|
}
|
|
|
|
return (size / 1024);
|
|
}
|
|
}
|