/*
 * Copyright 2011 Emmanuel Engelhart
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */

#include "reader.h"

/* NOTE(review): the two original system #include targets were lost (the text
 * between angle brackets was stripped). The headers below are the ones this
 * file demonstrably needs (rand/srand/atoi, time, sscanf, std::replace,
 * string streams, zim::Search) -- confirm against the upstream file. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <algorithm>
#include <sstream>

#include <zim/search.h>

namespace
{
/* Maximum number of redirect hops followed before giving up (cycle guard). */
const unsigned int MAX_REDIRECT_HOPS = 42;

/* Maximum number of draws getRandomPageUrl() makes while trying to avoid the
 * main page; prevents a hang when no other article can be drawn. */
const unsigned int MAX_RANDOM_ATTEMPTS = 64;

/* Number of raw bytes in a UUID. */
const unsigned int UUID_SIZE = 16;

/* Replace `article` by the target of its redirect chain, following at most
 * MAX_REDIRECT_HOPS hops. Returns true when the resulting article is not a
 * redirect any more, false when the hop limit was exhausted. */
bool followRedirects(zim::Article& article)
{
  unsigned int hops = 0;
  while (article.isRedirect() && hops < MAX_REDIRECT_HOPS) {
    article = article.getRedirectArticle();
    ++hops;
  }
  return !article.isRedirect();
}
}  // namespace

/* Return the lowercase hex digit of the high nibble of v */
inline char hi(char v)
{
  static const char hex[] = "0123456789abcdef";
  return hex[(v >> 4) & 0xf];
}

/* Return the lowercase hex digit of the low nibble of v */
inline char lo(char v)
{
  static const char hex[] = "0123456789abcdef";
  return hex[v & 0xf];
}

/* Format the first 16 bytes of `in` as a textual UUID
 * (8-4-4-4-12 lowercase hex digits). Input shorter than 16 bytes is padded
 * with zero bytes instead of being read out of range. */
std::string hexUUID(std::string in)
{
  if (in.size() < UUID_SIZE) {
    in.resize(UUID_SIZE, '\0');
  }

  /* Exclusive end index of each dash-separated group. */
  static const unsigned int groupEnds[] = {4, 6, 8, 10, 16};

  std::string out;
  out.reserve(36);
  unsigned int n = 0;
  for (unsigned int groupEnd : groupEnds) {
    if (n != 0) {
      out += '-';
    }
    for (; n < groupEnd; ++n) {
      out += hi(in[n]);
      out += lo(in[n]);
    }
  }
  return out;
}

namespace kiwix
{
/* Constructor: open the ZIM file and cache the article namespace bounds */
Reader::Reader(const std::string zimFilePath) : zimFileHandler(NULL)
{
  std::string tmpZimFilePath = zimFilePath;

  /* Remove potential trailing zimaa (keep the ".zim" part of a split file) */
  const size_t found = tmpZimFilePath.rfind("zimaa");
  if (found != std::string::npos && tmpZimFilePath.size() > 5
      && found == tmpZimFilePath.size() - 5) {
    tmpZimFilePath.resize(tmpZimFilePath.size() - 2);
  }

  this->zimFileHandler = new zim::File(tmpZimFilePath);

  /* The destructor does not run when a constructor throws, so release the
   * handler here if one of the queries below fails. */
  try {
    this->firstArticleOffset
        = this->zimFileHandler->getNamespaceBeginOffset('A');
    this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
    this->nsACount = this->zimFileHandler->getNamespaceCount('A');
    this->nsICount = this->zimFileHandler->getNamespaceCount('I');
    this->zimFilePath = zimFilePath;
  } catch (...) {
    delete this->zimFileHandler;
    this->zimFileHandler = NULL;
    throw;
  }

  /* initialize random seed: */
  srand(time(NULL));
}

/* Destructor */
Reader::~Reader()
{
  /* Deleting a null pointer is a no-op, no check needed */
  delete this->zimFileHandler;
}

zim::File* Reader::getZimFileHandler() const
{
  return this->zimFileHandler;
}

/* Parse the "M/Counter" metadata ("mimetype=count;mimetype=count;...") into
 * a map. Items without a mime type or without a parsable unsigned count are
 * skipped.
 * NOTE(review): the template arguments of the return type were lost in
 * SOURCE; they must match the declaration in reader.h -- confirm. */
std::map<const std::string, unsigned int> Reader::parseCounterMetadata() const
{
  std::map<const std::string, unsigned int> counters;

  zim::Article article = this->zimFileHandler->getArticle('M', "Counter");
  if (!article.good()) {
    return counters;
  }

  std::stringstream ssContent(article.getData());
  std::string item;
  while (std::getline(ssContent, item, ';')) {
    /* Declared per item on purpose: a failed getline leaves its target
     * untouched, so reusing the strings would leak the previous item's
     * values into this one. */
    std::string mimeType;
    std::string counterString;
    std::stringstream ssItem(item);
    std::getline(ssItem, mimeType, '=');
    std::getline(ssItem, counterString, '=');

    unsigned int counter = 0;
    if (!mimeType.empty() && !counterString.empty()
        && sscanf(counterString.c_str(), "%u", &counter) == 1) {
      counters.insert(std::make_pair(mimeType, counter));
    }
  }

  return counters;
}

/* Get the count of articles which can be indexed/displayed */
unsigned int Reader::getArticleCount() const
{
  const auto counterMap = this->parseCounterMetadata();

  /* Without counter metadata, fall back to the size of namespace A */
  if (counterMap.empty()) {
    return this->nsACount;
  }

  const auto it = counterMap.find("text/html");
  return it != counterMap.end() ? it->second : 0;
}

/* Get the count of medias content in the ZIM file */
unsigned int Reader::getMediaCount() const
{
  const auto counterMap = this->parseCounterMetadata();

  /* Without counter metadata, fall back to the size of namespace I */
  if (counterMap.empty()) {
    return this->nsICount;
  }

  static const char* const imageMimeTypes[]
      = {"image/jpeg", "image/gif", "image/png"};

  unsigned int counter = 0;
  for (const char* mimeType : imageMimeTypes) {
    const auto it = counterMap.find(mimeType);
    if (it != counterMap.end()) {
      counter += it->second;
    }
  }
  return counter;
}

/* Get the total of all items of a ZIM file, redirects included */
unsigned int Reader::getGlobalCount() const
{
  return this->zimFileHandler->getCountArticles();
}

/* Return the UID of the ZIM file */
std::string Reader::getId() const
{
  std::ostringstream s;
  s << this->zimFileHandler->getFileheader().getUuid();
  return s.str();
}

/* Return a page url from a title. Returns false (url untouched) when no
 * article with that title exists in namespace A. */
bool Reader::getPageUrlFromTitle(const std::string& title,
                                 std::string& url) const
{
  /* Extract the content from the zim file */
  zim::Article article = this->zimFileHandler->getArticleByTitle('A', title);
  if (!article.good()) {
    return false;
  }

  followRedirects(article);
  url = article.getLongUrl();
  return true;
}

/* Return the URL of a randomly drawn article of namespace A, avoiding the
 * main page when possible. The number of draws is bounded so that a file
 * whose only article is the main page cannot hang the caller; in that case
 * the last drawn URL is returned. */
std::string Reader::getRandomPageUrl() const
{
  const std::string mainPageUrl = this->getMainPageUrl();

  zim::Article article;
  unsigned int attempts = 0;
  do {
    /* Scale rand() into [0, nsACount) */
    const zim::size_type idx
        = this->firstArticleOffset
          + static_cast<zim::size_type>(
                static_cast<double>(rand())
                / (static_cast<double>(RAND_MAX) + 1) * this->nsACount);
    article = zimFileHandler->getArticle(idx);
  } while (article.getLongUrl() == mainPageUrl
           && ++attempts < MAX_RANDOM_ATTEMPTS);

  return article.getLongUrl();
}

/* Return the welcome page URL, falling back to the first page of namespace A
 * when the header declares no main page or its URL is empty */
std::string Reader::getMainPageUrl() const
{
  std::string url;

  if (this->zimFileHandler->getFileheader().hasMainPage()) {
    zim::Article article = zimFileHandler->getArticle(
        this->zimFileHandler->getFileheader().getMainPage());
    url = article.getLongUrl();
  }

  if (url.empty()) {
    url = getFirstPageUrl();
  }
  return url;
}

/* Fetch the favicon, trying the known locations in order. Returns true when
 * a non-empty content was found. */
bool Reader::getFavicon(std::string& content, std::string& mimeType) const
{
  static const char* const candidateUrls[]
      = {"/-/favicon.png", "/I/favicon.png", "/I/favicon", "/-/favicon"};

  unsigned int contentLength = 0;
  std::string title;

  for (const char* candidateUrl : candidateUrls) {
    this->getContentByUrl(
        candidateUrl, content, title, contentLength, mimeType);
    if (!content.empty()) {
      return true;
    }
  }
  return false;
}

std::string Reader::getZimFilePath() const
{
  return this->zimFilePath;
}

/* Return a metatag value (content of "/M/<name>") */
bool Reader::getMetatag(const std::string& name, std::string& value) const
{
  unsigned int contentLength = 0;
  std::string contentType;
  std::string title;

  return this->getContentByUrl(
      "/M/" + name, value, title, contentLength, contentType);
}

/* Return the "Title" metadata; when absent, derive a title from the file
 * name (underscores become spaces, everything from ".zim" on is dropped) */
std::string Reader::getTitle() const
{
  std::string value;
  this->getMetatag("Title", value);
  if (value.empty()) {
    value = getLastPathElement(zimFileHandler->getFilename());
    std::replace(value.begin(), value.end(), '_', ' ');
    const size_t pos = value.find(".zim");
    value = value.substr(0, pos);
  }
  return value;
}

std::string Reader::getName() const
{
  std::string value;
  this->getMetatag("Name", value);
  return value;
}

std::string Reader::getTags() const
{
  std::string value;
  this->getMetatag("Tags", value);
  return value;
}

std::string Reader::getDescription() const
{
  std::string value;
  this->getMetatag("Description", value);

  /* Mediawiki Collection tends to use the "Subtitle" name */
  if (value.empty()) {
    this->getMetatag("Subtitle", value);
  }

  return value;
}

std::string Reader::getLanguage() const
{
  std::string value;
  this->getMetatag("Language", value);
  return value;
}

std::string Reader::getDate() const
{
  std::string value;
  this->getMetatag("Date", value);
  return value;
}

std::string Reader::getCreator() const
{
  std::string value;
  this->getMetatag("Creator", value);
  return value;
}

std::string Reader::getPublisher() const
{
  std::string value;
  this->getMetatag("Publisher", value);
  return value;
}

/* Return the "startfileuid" metadata formatted as a textual UUID, or an
 * empty string when the metadata is absent. The metadata is read as a list
 * of newline-terminated decimal byte values. */
std::string Reader::getOrigId() const
{
  std::string value;
  this->getMetatag("startfileuid", value);
  if (value.empty()) {
    return "";
  }

  char uuidBytes[UUID_SIZE] = {0};
  unsigned int byteCount = 0;
  std::string token;
  for (char c : value) {
    if (c != '\n') {
      token += c;
      continue;
    }
    /* Never write past the 16 UUID bytes, whatever the metadata contains */
    if (byteCount >= UUID_SIZE) {
      break;
    }
    uuidBytes[byteCount++] = static_cast<char>(atoi(token.c_str()));
    token.clear();
  }

  /* Pass the length explicitly: the bytes may legitimately contain zeros,
   * which would truncate a string built from a plain char pointer. */
  return hexUUID(std::string(uuidBytes, UUID_SIZE));
}

/* Return the first page URL */
std::string Reader::getFirstPageUrl() const
{
  const zim::size_type firstPageOffset
      = zimFileHandler->getNamespaceBeginOffset('A');
  zim::Article article = zimFileHandler->getArticle(firstPageOffset);
  return article.getLongUrl();
}

/* Split "/<namespace>/<title>" into its parts. *ns receives the last
 * character of the namespace segment and is left untouched when the url has
 * no such segment. Always returns true. */
bool Reader::parseUrl(const std::string& url, char* ns,
                      std::string& title) const
{
  const std::string::size_type urlLength = url.size();
  std::string::size_type offset = 0;

  /* Ignore the '/' */
  while (offset < urlLength && url[offset] == '/') {
    ++offset;
  }

  /* Get namespace */
  while (offset < urlLength && url[offset] != '/') {
    *ns = url[offset];
    ++offset;
  }

  /* Ignore the '/' */
  while (offset < urlLength && url[offset] == '/') {
    ++offset;
  }

  /* Everything left is the content title */
  title = url.substr(offset);

  return true;
}

/* Return article by url; an empty url designates the main page */
bool Reader::getArticleObjectByDecodedUrl(const std::string& url,
                                          zim::Article& article) const
{
  if (this->zimFileHandler == NULL) {
    return false;
  }

  /* Parse the url */
  char ns = 0;
  std::string urlStr;
  this->parseUrl(url, &ns, urlStr);

  /* Main page */
  if (urlStr.empty() && ns == 0) {
    this->parseUrl(this->getMainPageUrl(), &ns, urlStr);
  }

  /* Extract the content from the zim file */
  article = zimFileHandler->getArticle(ns, urlStr);
  return article.good();
}

/* Return the mimeType without the content */
bool Reader::getMimeTypeByUrl(const std::string& url,
                              std::string& mimeType) const
{
  if (this->zimFileHandler == NULL) {
    return false;
  }

  zim::Article article;
  if (!this->getArticleObjectByDecodedUrl(url, article)) {
    mimeType = "";
    return false;
  }

  try {
    mimeType = article.getMimeType();
  } catch (const std::exception& e) {
    std::cerr << "Unable to get the mimetype for " << url << ":" << e.what()
              << std::endl;
    mimeType = "application/octet-stream";
  }
  return true;
}

/* Get a content from a zim file */
bool Reader::getContentByUrl(const std::string& url,
                             std::string& content,
                             std::string& title,
                             unsigned int& contentLength,
                             std::string& contentType) const
{
  return this->getContentByEncodedUrl(
      url, content, title, contentLength, contentType);
}

/* Get a content from a percent-encoded url; baseUrl receives the url of the
 * article finally served (after redirects) */
bool Reader::getContentByEncodedUrl(const std::string& url,
                                    std::string& content,
                                    std::string& title,
                                    unsigned int& contentLength,
                                    std::string& contentType,
                                    std::string& baseUrl) const
{
  return this->getContentByDecodedUrl(kiwix::urlDecode(url),
                                      content,
                                      title,
                                      contentLength,
                                      contentType,
                                      baseUrl);
}

/* Same as above for callers which do not need the base url */
bool Reader::getContentByEncodedUrl(const std::string& url,
                                    std::string& content,
                                    std::string& title,
                                    unsigned int& contentLength,
                                    std::string& contentType) const
{
  std::string stubRedirectUrl;
  /* The url is passed through untouched: the overload called here does the
   * decoding, decoding here as well would decode twice. */
  return this->getContentByEncodedUrl(
      url, content, title, contentLength, contentType, stubRedirectUrl);
}

/* Get a content from an already decoded url, base url not needed */
bool Reader::getContentByDecodedUrl(const std::string& url,
                                    std::string& content,
                                    std::string& title,
                                    unsigned int& contentLength,
                                    std::string& contentType) const
{
  std::string stubRedirectUrl;
  /* The url is already decoded, it must not be decoded again */
  return this->getContentByDecodedUrl(
      url, content, title, contentLength, contentType, stubRedirectUrl);
}

/* Get a content from an already decoded url.
 * Returns false when no article matches the url. When the redirect chain
 * cannot be resolved, true is still returned but content, contentType,
 * title and baseUrl are not filled in. */
bool Reader::getContentByDecodedUrl(const std::string& url,
                                    std::string& content,
                                    std::string& title,
                                    unsigned int& contentLength,
                                    std::string& contentType,
                                    std::string& baseUrl) const
{
  content = "";
  contentType = "";
  contentLength = 0;

  zim::Article article;
  if (!this->getArticleObjectByDecodedUrl(url, article)) {
    return false;
  }

  /* If redirect */
  if (followRedirects(article)) {
    /* Compute base url (might be different from the url if redirects) */
    baseUrl = "/" + std::string(1, article.getNamespace()) + "/"
              + article.getUrl();

    /* Get the content mime-type */
    try {
      contentType = article.getMimeType();
    } catch (const std::exception& e) {
      std::cerr << "Unable to get the mimetype for " << baseUrl << ":"
                << e.what() << std::endl;
      contentType = "application/octet-stream";
    }

    /* Get the data */
    content = std::string(article.getData().data(), article.getArticleSize());
    title = article.getTitle();
  }

  /* Try to set a stub HTML header/footer if necessary.
   * NOTE(review): the HTML literals of this statement were stripped from
   * SOURCE (only `content.find("` and `" + content + "";` survived); they
   * are reconstructed here -- confirm against the upstream file. */
  if (contentType.find("text/html") != std::string::npos
      && content.find("<body") == std::string::npos
      && content.find("<BODY") == std::string::npos) {
    content = "<html><head><title>" + article.getTitle()
              + "</title><meta http-equiv=\"Content-Type\" "
                "content=\"text/html; charset=utf-8\" /></head><body>"
              + content + "</body></html>";
  }

  /* Get the data length.
   * NOTE(review): this is the size of the stored article, not of `content`
   * once wrapped in the stub above -- verify this is what callers expect. */
  contentLength = article.getArticleSize();

  return true;
}

/* Check if an article exists */
bool Reader::urlExists(const std::string& url) const
{
  char ns = 0;
  std::string titleStr;
  this->parseUrl(url, &ns, titleStr);
  titleStr = "/" + titleStr;
  zim::File::const_iterator findItr = zimFileHandler->find(ns, titleStr);
  return findItr != zimFileHandler->end() && findItr->getUrl() == titleStr;
}

/* Does the ZIM file has a fulltext index */
bool Reader::hasFulltextIndex() const
{
  return this->urlExists("/Z/fulltextIndex/xapian")
         && !zimFileHandler->is_multiPart();
}

/* Search titles by prefix. Found suggestions are added to the internal
 * list (cleared first when reset is true); returns true when at least one
 * title matched the prefix. */
bool Reader::searchSuggestions(const std::string& prefix,
                               unsigned int suggestionsCount,
                               const bool reset)
{
  bool retVal = false;

  /* Reset the suggestions otherwise check if the suggestions number is less
   * than the suggestionsCount */
  if (reset) {
    this->suggestions.clear();
    this->suggestionsOffset = this->suggestions.begin();
  } else if (this->suggestions.size() > suggestionsCount) {
    return false;
  }

  /* Return if no prefix */
  if (prefix.empty()) {
    return false;
  }

  for (zim::File::const_iterator articleItr
       = zimFileHandler->findByTitle('A', prefix);
       articleItr != zimFileHandler->end()
       && articleItr->getTitle().compare(0, prefix.size(), prefix) == 0
       && this->suggestions.size() < suggestionsCount;
       ++articleItr) {
    /* Extract the interesting part of article title & url */
    const std::string normalizedArticleTitle
        = kiwix::normalize(articleItr->getTitle());
    std::string articleFinalUrl = "/A/" + articleItr->getUrl();
    if (articleItr->isRedirect()) {
      zim::Article article = *articleItr;
      followRedirects(article);
      articleFinalUrl = "/A/" + article.getUrl();
    }

    /* Go through all already found suggestions and skip if this
       article is already in the suggestions list (with an other
       title). The scan also yields the insertion position. */
    bool insert = true;
    auto suggestionItr = this->suggestions.begin();
    for (; suggestionItr != this->suggestions.end(); ++suggestionItr) {
      const int result = normalizedArticleTitle.compare((*suggestionItr)[2]);
      if (result == 0 && articleFinalUrl.compare((*suggestionItr)[1]) == 0) {
        insert = false;
        break;
      } else if (result < 0) {
        break;
      }
    }

    /* Insert if possible: [0] title, [1] final url, [2] normalized title */
    if (insert) {
      std::vector<std::string> suggestion;
      suggestion.push_back(articleItr->getTitle());
      suggestion.push_back(articleFinalUrl);
      suggestion.push_back(normalizedArticleTitle);
      this->suggestions.insert(suggestionItr, suggestion);
    }

    /* Suggestions were found */
    retVal = true;
  }

  /* Set the cursor to the beginning */
  this->suggestionsOffset = this->suggestions.begin();

  return retVal;
}

/* Return the title as given plus its ucFirst, lcFirst and toTitle forms */
std::vector<std::string> Reader::getTitleVariants(
    const std::string& title) const
{
  std::vector<std::string> variants;
  variants.push_back(title);
  variants.push_back(kiwix::ucFirst(title));
  variants.push_back(kiwix::lcFirst(title));
  variants.push_back(kiwix::toTitle(title));
  return variants;
}

/* Try also a few variations of the prefix to have better results */
bool Reader::searchSuggestionsSmart(const std::string& prefix,
                                    unsigned int suggestionsCount)
{
  const std::vector<std::string> variants = this->getTitleVariants(prefix);
  bool retVal = false;

  this->suggestions.clear();
  this->suggestionsOffset = this->suggestions.begin();

  /* Try to search in the title using fulltext search database.
   * NOTE(review): the returned pointer is never released here; check who
   * owns the zim::Search object returned by zim::File::suggestions(). */
  const zim::Search* suggestionSearch
      = this->getZimFileHandler()->suggestions(prefix, 0, suggestionsCount);

  if (suggestionSearch->get_matches_estimated()) {
    for (auto current = suggestionSearch->begin();
         current != suggestionSearch->end();
         current++) {
      std::vector<std::string> suggestion;
      suggestion.push_back(current->getTitle());
      suggestion.push_back("/A/" + current->getUrl());
      suggestion.push_back(kiwix::normalize(current->getTitle()));
      this->suggestions.push_back(suggestion);
    }
    this->suggestionsOffset = this->suggestions.begin();
    retVal = true;
  } else {
    /* No fulltext result: fall back to a prefix search on every variant */
    for (const std::string& variant : variants) {
      retVal = this->searchSuggestions(variant, suggestionsCount, false)
               || retVal;
    }
  }

  return retVal;
}

/* Get next suggestion */
bool Reader::getNextSuggestion(std::string& title)
{
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* title */
  title = (*(this->suggestionsOffset))[0];

  /* increment the cursor for the next call */
  this->suggestionsOffset++;

  return true;
}

/* Get next suggestion, title and url */
bool Reader::getNextSuggestion(std::string& title, std::string& url)
{
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* title and url */
  title = (*(this->suggestionsOffset))[0];
  url = (*(this->suggestionsOffset))[1];

  /* increment the cursor for the next call */
  this->suggestionsOffset++;

  return true;
}

/* Check if the file has a checksum */
bool Reader::canCheckIntegrity() const
{
  return this->zimFileHandler->getChecksum() != "";
}

/* Return true if corrupted, false otherwise */
bool Reader::isCorrupted() const
{
  try {
    if (this->zimFileHandler->verify() == true) {
      return false;
    }
  } catch (const std::exception& e) {
    std::cerr << e.what() << std::endl;
    return true;
  }

  return true;
}

/* Return the file size divided by 1024, works also for split files */
unsigned int Reader::getFileSize() const
{
  zim::File* file = this->getZimFileHandler();
  zim::offset_type size = 0;

  if (file != NULL) {
    size = file->getFilesize();
  }

  return static_cast<unsigned int>(size / 1024);
}
}