libkiwix/src/common/kiwix/reader.cpp

680 lines
19 KiB
C++

/*
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#include "reader.h"
inline char hi(char v) {
char hex[] = "0123456789abcdef";
return hex[(v >> 4) & 0xf];
}
inline char lo(char v) {
char hex[] = "0123456789abcdef";
return hex[v & 0xf];
}
std::string hexUUID (std::string in) {
std::ostringstream out;
for (unsigned n = 0; n < 4; ++n)
out << hi(in[n]) << lo(in[n]);
out << '-';
for (unsigned n = 4; n < 6; ++n)
out << hi(in[n]) << lo(in[n]);
out << '-';
for (unsigned n = 6; n < 8; ++n)
out << hi(in[n]) << lo(in[n]);
out << '-';
for (unsigned n = 8; n < 10; ++n)
out << hi(in[n]) << lo(in[n]);
out << '-';
for (unsigned n = 10; n < 16; ++n)
out << hi(in[n]) << lo(in[n]);
std::string op=out.str();
return op;
}
namespace kiwix {
/* Constructor */
Reader::Reader(const string zimFilePath)
: zimFileHandler(NULL) {
string tmpZimFilePath = zimFilePath;
/* Remove potential trailing zimaa */
size_t found = tmpZimFilePath.rfind("zimaa");
if (found != string::npos &&
tmpZimFilePath.size() > 5 &&
found == tmpZimFilePath.size() - 5) {
tmpZimFilePath.resize(tmpZimFilePath.size() - 2);
}
this->zimFileHandler = new zim::File(tmpZimFilePath);
if (this->zimFileHandler != NULL) {
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
this->nsACount = this->zimFileHandler->getNamespaceCount('A');
this->nsICount = this->zimFileHandler->getNamespaceCount('I');
}
/* initialize random seed: */
srand ( time(NULL) );
}
/* Destructor */
Reader::~Reader() {
if (this->zimFileHandler != NULL) {
delete this->zimFileHandler;
}
}
zim::File* Reader::getZimFileHandler() {
return this->zimFileHandler;
}
/* Reset the cursor for GetNextArticle() */
void Reader::reset() {
this->currentArticleOffset = this->firstArticleOffset;
}
std::map<std::string, unsigned int> Reader::parseCounterMetadata() {
std::map<std::string, unsigned int> counters;
string content, mimeType, item, counterString;
unsigned int contentLength, counter;
string counterUrl = "/M/Counter";
this->getContentByUrl(counterUrl, content, contentLength, mimeType);
stringstream ssContent(content);
while(getline(ssContent, item, ';')) {
stringstream ssItem(item);
getline(ssItem, mimeType, '=');
getline(ssItem, counterString, '=');
if (!counterString.empty() && !mimeType.empty()) {
sscanf(counterString.c_str(), "%u", &counter);
counters.insert(pair<string, int>(mimeType, counter));
}
}
return counters;
}
/* Get the count of articles which can be indexed/displayed */
unsigned int Reader::getArticleCount() {
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata();
unsigned int counter = 0;
if (counterMap.empty()) {
counter = this->nsACount;
} else {
std::map<std::string, unsigned int>::const_iterator it = counterMap.find("text/html");
if (it != counterMap.end())
counter = it->second;
}
return counter;
}
/* Get the count of medias content in the ZIM file */
unsigned int Reader::getMediaCount() {
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata();
unsigned int counter = 0;
if (counterMap.empty())
counter = this->nsICount;
else {
std::map<std::string, unsigned int>::const_iterator it;
it = counterMap.find("image/jpeg");
if (it != counterMap.end())
counter += it->second;
it = counterMap.find("image/gif");
if (it != counterMap.end())
counter += it->second;
it = counterMap.find("image/png");
if (it != counterMap.end())
counter += it->second;
}
return counter;
}
/* Get the total of all items of a ZIM file, redirects included */
unsigned int Reader::getGlobalCount() {
return this->zimFileHandler->getCountArticles();
}
/* Return the UID of the ZIM file */
string Reader::getId() {
std::ostringstream s;
s << this->zimFileHandler->getFileheader().getUuid();
return s.str();
}
/* Return a page url from a title */
bool Reader::getPageUrlFromTitle(const string &title, string &url) {
/* Extract the content from the zim file */
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findxByTitle('A', title);
/* Test if the article was found */
if (resultPair.first == true) {
/* Get the article */
zim::Article article = *resultPair.second;
/* If redirect */
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
url = article.getLongUrl();
return true;
}
return false;
}
/* Return an URL from a title*/
string Reader::getRandomPageUrl() {
zim::Article article;
zim::size_type idx;
std::string mainPageUrl = this->getMainPageUrl();
do {
idx = this->firstArticleOffset +
(zim::size_type)((double)rand() / ((double)RAND_MAX + 1) * this->nsACount);
article = zimFileHandler->getArticle(idx);
} while (article.getLongUrl() == mainPageUrl);
return article.getLongUrl().c_str();
}
/* Return the welcome page URL */
string Reader::getMainPageUrl() {
string url = "";
if (this->zimFileHandler->getFileheader().hasMainPage()) {
zim::Article article = zimFileHandler->getArticle(this->zimFileHandler->getFileheader().getMainPage());
url = article.getLongUrl();
if (url.empty()) {
url = getFirstPageUrl();
}
} else {
url = getFirstPageUrl();
}
return url;
}
bool Reader::getFavicon(string &content, string &mimeType) {
unsigned int contentLength = 0;
this->getContentByUrl( "/-/favicon.png", content,
contentLength, mimeType);
if (content.empty()) {
this->getContentByUrl( "/I/favicon.png", content,
contentLength, mimeType);
if (content.empty()) {
this->getContentByUrl( "/I/favicon", content,
contentLength, mimeType);
if (content.empty()) {
this->getContentByUrl( "/-/favicon", content,
contentLength, mimeType);
}
}
}
return content.empty() ? false : true;
}
/* Return a metatag value */
bool Reader::getMetatag(const string &name, string &value) {
unsigned int contentLength = 0;
string contentType = "";
return this->getContentByUrl( "/M/" + name, value,
contentLength, contentType);
}
string Reader::getTitle() {
string value;
this->getMetatag("Title", value);
if (value.empty()) {
value = getLastPathElement(zimFileHandler->getFilename());
std::replace(value.begin(), value.end(), '_', ' ');
size_t pos = value.find(".zim");
value = value.substr(0, pos);
}
return value;
}
string Reader::getName() {
string value;
this->getMetatag("Name", value);
return value;
}
string Reader::getTags() {
string value;
this->getMetatag("Tags", value);
return value;
}
string Reader::getDescription() {
string value;
this->getMetatag("Description", value);
/* Mediawiki Collection tends to use the "Subtitle" name */
if (value.empty()) {
this->getMetatag("Subtitle", value);
}
return value;
}
string Reader::getLanguage() {
string value;
this->getMetatag("Language", value);
return value;
}
string Reader::getDate() {
string value;
this->getMetatag("Date", value);
return value;
}
string Reader::getCreator() {
string value;
this->getMetatag("Creator", value);
return value;
}
string Reader::getPublisher() {
string value;
this->getMetatag("Publisher", value);
return value;
}
string Reader::getOrigId() {
string value;
this->getMetatag("startfileuid", value);
if(value.empty())
return "";
std::string id=value;
std::string origID;
std::string temp="";
unsigned int k=0;
char tempArray[16]="";
for(unsigned int i=0; i<id.size(); i++)
{
if(id[i]=='\n')
{
tempArray[k]= atoi(temp.c_str());
temp="";
k++;
}
else
{
temp+=id[i];
}
}
origID=hexUUID(tempArray);
return origID;
}
/* Return the first page URL */
string Reader::getFirstPageUrl() {
string url;
zim::size_type firstPageOffset = zimFileHandler->getNamespaceBeginOffset('A');
zim::Article article = zimFileHandler->getArticle(firstPageOffset);
url = article.getLongUrl();
return url;
}
bool Reader::parseUrl(const string &url, char *ns, string &title) {
/* Offset to visit the url */
unsigned int urlLength = url.size();
unsigned int offset = 0;
/* Ignore the '/' */
while ((offset < urlLength) && (url[offset] == '/')) offset++;
/* Get namespace */
while ((offset < urlLength) && (url[offset] != '/')) {
*ns= url[offset];
offset++;
}
/* Ignore the '/' */
while ((offset < urlLength) && (url[offset] == '/')) offset++;
/* Get content title */
unsigned int titleOffset = offset;
while (offset < urlLength) {
offset++;
}
/* unescape title */
title = url.substr(titleOffset, offset - titleOffset);
return true;
}
/* Return article by url */
bool Reader::getArticleObjectByDecodedUrl(const string &url, zim::Article &article) {
bool retVal = false;
if (this->zimFileHandler != NULL) {
/* Parse the url */
char ns = 0;
string titleStr;
this->parseUrl(url, &ns, titleStr);
/* Main page */
if (titleStr.empty() && ns == 0) {
this->parseUrl(this->getMainPageUrl(), &ns, titleStr);
}
/* Extract the content from the zim file */
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findx(ns, titleStr);
/* Test if the article was found */
if (resultPair.first == true) {
article = zimFileHandler->getArticle(resultPair.second.getIndex());
retVal = true;
}
}
return retVal;
}
/* Return the mimeType without the content */
bool Reader::getMimeTypeByUrl(const string &url, string &mimeType) {
bool retVal = false;
if (this->zimFileHandler != NULL) {
zim::Article article;
if (this->getArticleObjectByDecodedUrl(url, article)) {
try {
mimeType = string(article.getMimeType().data(), article.getMimeType().size());
} catch (exception &e) {
cerr << "Unable to get the mimetype for "<< url << ":" << e.what() << endl;
mimeType = "application/octet-stream";
}
retVal = true;
} else {
mimeType = "";
}
}
return retVal;
}
/* Get a content from a zim file */
bool Reader::getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) {
return this->getContentByEncodedUrl(url, content, contentLength, contentType);
}
bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) {
return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, baseUrl);
}
bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) {
std::string stubRedirectUrl;
return this->getContentByEncodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl);
}
bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) {
std::string stubRedirectUrl;
return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl);
}
bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) {
bool retVal = false;
content="";
contentType="";
contentLength = 0;
if (this->zimFileHandler != NULL) {
zim::Article article;
if (this->getArticleObjectByDecodedUrl(url, article)) {
/* If redirect */
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
if (loopCounter < 42) {
/* Compute base url (might be different from the url if redirects */
baseUrl = "/" + std::string(1, article.getNamespace()) + "/" + article.getUrl();
/* Get the content mime-type */
try {
contentType = string(article.getMimeType().data(), article.getMimeType().size());
} catch (exception &e) {
cerr << "Unable to get the mimetype for "<< baseUrl<< ":" << e.what() << endl;
contentType = "application/octet-stream";
}
/* Get the data */
content = string(article.getData().data(), article.getArticleSize());
}
/* Try to set a stub HTML header/footer if necesssary */
if (contentType.find("text/html") != string::npos &&
content.find("<body") == std::string::npos &&
content.find("<BODY") == std::string::npos) {
content = "<html><head><title>" + article.getTitle() + "</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body>" + content + "</body></html>";
}
/* Get the data length */
contentLength = article.getArticleSize();
/* Set return value */
retVal = true;
}
}
return retVal;
}
/* Check if an article exists */
bool Reader::urlExists(const string &url) {
char ns = 0;
string titleStr;
this->parseUrl(url, &ns, titleStr);
zim::File::const_iterator findItr = zimFileHandler->find(ns, titleStr);
return findItr->getUrl() == titleStr;
}
/* Search titles by prefix */
bool Reader::searchSuggestions(const string &prefix, unsigned int suggestionsCount, const bool reset) {
bool retVal = false;
zim::File::const_iterator articleItr;
/* Reset the suggestions otherwise check if the suggestions number is less than the suggestionsCount */
if (reset) {
this->suggestions.clear();
this->suggestionsOffset = this->suggestions.begin();
} else {
if (this->suggestions.size() > suggestionsCount) {
return false;
}
}
/* Return if no prefix */
if (prefix.size() == 0) {
return false;
}
for (articleItr = zimFileHandler->findByTitle('A', prefix);
articleItr != zimFileHandler->end() &&
articleItr->getTitle().compare(0, prefix.size(), prefix) == 0 &&
this->suggestions.size() < suggestionsCount ;
++articleItr) {
/* Extract the interesting part of article title & url */
std::string normalizedArticleTitle = kiwix::normalize(articleItr->getTitle());
std::string articleFinalUrl = "/A/"+articleItr->getUrl();
if (articleItr->isRedirect()) {
zim::Article article = *articleItr;
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
articleFinalUrl = "/A/"+article.getUrl();
}
/* Go through all already found suggestions and skip if this
article is already in the suggestions list (with an other
title) */
bool insert = true;
std::vector< std::vector<std::string> >::iterator suggestionItr;
for (suggestionItr = this->suggestions.begin(); suggestionItr != this->suggestions.end(); suggestionItr++) {
int result = normalizedArticleTitle.compare((*suggestionItr)[2]);
if (result == 0 && articleFinalUrl.compare((*suggestionItr)[1]) == 0) {
insert = false;
break;
} else if (result < 0) {
break;
}
}
/* Insert if possible */
if (insert) {
std::vector<std::string> suggestion;
suggestion.push_back(articleItr->getTitle());
suggestion.push_back(articleFinalUrl);
suggestion.push_back(normalizedArticleTitle);
this->suggestions.insert(suggestionItr, suggestion);
}
/* Suggestions where found */
retVal = true;
}
/* Set the cursor to the begining */
this->suggestionsOffset = this->suggestions.begin();
return retVal;
}
std::vector<std::string> Reader::getTitleVariants(const std::string &title) {
std::vector<std::string> variants;
variants.push_back(title);
variants.push_back(kiwix::ucFirst(title));
variants.push_back(kiwix::lcFirst(title));
variants.push_back(kiwix::toTitle(title));
return variants;
}
/* Try also a few variations of the prefix to have better results */
bool Reader::searchSuggestionsSmart(const string &prefix, unsigned int suggestionsCount) {
std::vector<std::string> variants = this->getTitleVariants(prefix);
bool retVal;
this->suggestions.clear();
this->suggestionsOffset = this->suggestions.begin();
for (std::vector<std::string>::iterator variantsItr = variants.begin();
variantsItr != variants.end();
variantsItr++) {
retVal = this->searchSuggestions(*variantsItr, suggestionsCount, false) || retVal;
}
return retVal;
}
/* Get next suggestion */
bool Reader::getNextSuggestion(string &title) {
if (this->suggestionsOffset != this->suggestions.end()) {
/* title */
title = (*(this->suggestionsOffset))[0];
/* increment the cursor for the next call */
this->suggestionsOffset++;
return true;
}
return false;
}
bool Reader::getNextSuggestion(string &title, string &url) {
if (this->suggestionsOffset != this->suggestions.end()) {
/* title */
title = (*(this->suggestionsOffset))[0];
url = (*(this->suggestionsOffset))[1];
/* increment the cursor for the next call */
this->suggestionsOffset++;
return true;
}
return false;
}
/* Check if the file has as checksum */
bool Reader::canCheckIntegrity() {
return this->zimFileHandler->getChecksum() != "";
}
/* Return true if corrupted, false otherwise */
bool Reader::isCorrupted() {
try {
if (this->zimFileHandler->verify() == true)
return false;
} catch (exception &e) {
cerr << e.what() << endl;
return true;
}
return true;
}
/* Return the file size, works also for splitted files */
unsigned int Reader::getFileSize() {
zim::File *file = this->getZimFileHandler();
zim::offset_type size = 0;
if (file != NULL) {
size = file->getFilesize();
}
return (size / 1024);
}
}