/* * Copyright 2011 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #include "reader.h" #include #include #include #include #include #include "tools.h" #include "tools/stringTools.h" #include "tools/otherTools.h" #include "tools/archiveTools.h" namespace kiwix { /* Constructor */ Reader::Reader(const string zimFilePath) : zimArchive(nullptr), zimFilePath(zimFilePath) { string tmpZimFilePath = zimFilePath; /* Remove potential trailing zimaa */ size_t found = tmpZimFilePath.rfind("zimaa"); if (found != string::npos && tmpZimFilePath.size() > 5 && found == tmpZimFilePath.size() - 5) { tmpZimFilePath.resize(tmpZimFilePath.size() - 2); } zimArchive.reset(new zim::Archive(tmpZimFilePath)); /* initialize random seed: */ srand(time(nullptr)); } Reader::Reader(const std::shared_ptr archive, bool _marker) : zimArchive(archive), zimFilePath(archive->getFilename()) {} #ifndef _WIN32 Reader::Reader(int fd) : zimArchive(new zim::Archive(fd)), zimFilePath("") { /* initialize random seed: */ srand(time(nullptr)); } Reader::Reader(int fd, zim::offset_type offset, zim::size_type size) : zimArchive(new zim::Archive(fd, offset, size)), zimFilePath("") { /* initialize random seed: */ srand(time(nullptr)); } #endif // #ifndef _WIN32 zim::Archive* Reader::getZimArchive() const { return zimArchive.get(); } MimeCounterType Reader::parseCounterMetadata() const { return kiwix::parseArchiveCounter(*zimArchive); } /* Get the count of articles which can be indexed/displayed */ unsigned int Reader::getArticleCount() const { std::map counterMap = this->parseCounterMetadata(); unsigned int counter = 0; for(auto &pair:counterMap) { if (startsWith(pair.first, "text/html")) { counter += pair.second; } } return counter; } /* Get the count of medias content in the ZIM file */ unsigned int Reader::getMediaCount() const { return kiwix::getArchiveMediaCount(*zimArchive); } /* Get the total of all items of a ZIM file, redirects included */ unsigned int Reader::getGlobalCount() const { return zimArchive->getEntryCount(); } /* Return the UID of the ZIM file */ string Reader::getId() const { return kiwix::getArchiveId(*zimArchive); } Entry Reader::getRandomPage() const { try { return Entry(zimArchive->getRandomEntry(), true); } catch(...) { throw NoEntry(); } } Entry Reader::getMainPage() const { return Entry(zimArchive->getMainEntry(), true); } bool Reader::getFavicon(string& content, string& mimeType) const { return kiwix::getArchiveFavicon(*zimArchive, 48, content, mimeType); } string Reader::getZimFilePath() const { return zimFilePath; } /* Return a metatag value */ bool Reader::getMetadata(const string& name, string& value) const { try { value = zimArchive->getMetadata(name); return true; } catch(zim::EntryNotFound& e) { return false; } } #define METADATA(NAME) std::string v; getMetadata(NAME, v); return v; string Reader::getName() const { return kiwix::getMetaName(*zimArchive); } string Reader::getTitle() const { return kiwix::getArchiveTitle(*zimArchive); } string Reader::getCreator() const { return kiwix::getMetaCreator(*zimArchive); } string Reader::getPublisher() const { return kiwix::getMetaPublisher(*zimArchive); } string Reader::getDate() const { return kiwix::getMetaDate(*zimArchive); } string Reader::getDescription() const { return kiwix::getMetaDescription(*zimArchive); } string Reader::getLongDescription() const { METADATA("LongDescription") } string Reader::getLanguage() const { return kiwix::getMetaLanguage(*zimArchive); } string Reader::getLicense() const { METADATA("License") } string Reader::getTags(bool original) const { return kiwix::getMetaTags(*zimArchive, original); } string Reader::getTagStr(const std::string& tagName) const { string tags_str; getMetadata("Tags", tags_str); return getTagValueFromTagList(convertTags(tags_str), tagName); } bool Reader::getTagBool(const std::string& tagName) const { return convertStrToBool(getTagStr(tagName)); } string Reader::getRelation() const { METADATA("Relation") } string Reader::getFlavour() const { return kiwix::getMetaFlavour(*zimArchive); } string Reader::getSource() const { METADATA("Source") } string Reader::getScraper() const { METADATA("Scraper") } #undef METADATA Entry Reader::getEntryFromPath(const std::string& path) const { try { return Entry(kiwix::getEntryFromPath(*zimArchive, path), true); } catch (zim::EntryNotFound& e) { throw NoEntry(); } } Entry Reader::getEntryFromEncodedPath(const std::string& path) const { return getEntryFromPath(urlDecode(path, true)); } Entry Reader::getEntryFromTitle(const std::string& title) const { try { return Entry(zimArchive->getEntryByTitle(title), true); } catch(zim::EntryNotFound& e) { throw NoEntry(); } } bool Reader::pathExists(const string& path) const { return zimArchive->hasEntryByPath(path); } /* Does the ZIM file has a fulltext index */ bool Reader::hasFulltextIndex() const { return zimArchive->hasFulltextIndex(); } /* Search titles by prefix */ bool Reader::searchSuggestions(const string& prefix, unsigned int suggestionsCount, const bool reset) { /* Reset the suggestions otherwise check if the suggestions number is less * than the suggestionsCount */ if (reset) { this->suggestions.clear(); this->suggestionsOffset = this->suggestions.begin(); } else { if (this->suggestions.size() > suggestionsCount) { return false; } } auto ret = searchSuggestions(prefix, suggestionsCount, this->suggestions); /* Set the cursor to the begining */ this->suggestionsOffset = this->suggestions.begin(); return ret; } bool Reader::searchSuggestions(const string& prefix, unsigned int suggestionsCount, SuggestionsList_t& results) { bool retVal = false; /* Return if no prefix */ if (prefix.size() == 0) { return false; } for (auto& entry: zimArchive->findByTitle(prefix)) { if (results.size() >= suggestionsCount) { break; } /* Extract the interesting part of article title & url */ std::string normalizedArticleTitle = kiwix::normalize(entry.getTitle()); // Get the final path. auto item = entry.getItem(true); std::string articleFinalUrl = item.getPath(); /* Go through all already found suggestions and skip if this article is already in the suggestions list (with an other title) */ bool insert = true; std::vector::iterator suggestionItr; for (suggestionItr = results.begin(); suggestionItr != results.end(); suggestionItr++) { int result = normalizedArticleTitle.compare((*suggestionItr).getNormalizedTitle()); if (result == 0 && articleFinalUrl.compare((*suggestionItr).getPath()) == 0) { insert = false; break; } else if (result < 0) { break; } } /* Insert if possible */ if (insert) { SuggestionItem suggestion(entry.getTitle(), normalizedArticleTitle, articleFinalUrl); results.insert(suggestionItr, suggestion); } /* Suggestions where found */ retVal = true; } return retVal; } std::vector Reader::getTitleVariants( const std::string& title) const { return kiwix::getTitleVariants(title); } bool Reader::searchSuggestionsSmart(const string& prefix, unsigned int suggestionsCount) { this->suggestions.clear(); this->suggestionsOffset = this->suggestions.begin(); auto ret = searchSuggestionsSmart(prefix, suggestionsCount, this->suggestions); this->suggestionsOffset = this->suggestions.begin(); return ret; } /* Try also a few variations of the prefix to have better results */ bool Reader::searchSuggestionsSmart(const string& prefix, unsigned int suggestionsCount, SuggestionsList_t& results) { std::vector variants = this->getTitleVariants(prefix); auto suggestionSearcher = zim::SuggestionSearcher(*zimArchive); if (zimArchive->hasTitleIndex()) { auto suggestionSearch = suggestionSearcher.suggest(prefix); const auto suggestions = suggestionSearch.getResults(0, suggestionsCount); for (auto current : suggestions) { SuggestionItem suggestion(current.getTitle(), kiwix::normalize(current.getTitle()), current.getPath(), current.getSnippet()); results.push_back(suggestion); } } else { // Check some of the variants of the prefix for (std::vector::iterator variantsItr = variants.begin(); variantsItr != variants.end(); variantsItr++) { auto suggestionSearch = suggestionSearcher.suggest(*variantsItr); for (auto current : suggestionSearch.getResults(0, suggestionsCount)) { if (results.size() >= suggestionsCount) { break; } SuggestionItem suggestion(current.getTitle(), kiwix::normalize(current.getTitle()), current.getPath(), current.getSnippet()); results.push_back(suggestion); } } } return results.size() > 0; } /* Get next suggestion */ bool Reader::getNextSuggestion(string& title) { if (this->suggestionsOffset != this->suggestions.end()) { /* title */ title = (*(this->suggestionsOffset)).getTitle(); /* increment the cursor for the next call */ this->suggestionsOffset++; return true; } return false; } bool Reader::getNextSuggestion(string& title, string& url) { if (this->suggestionsOffset != this->suggestions.end()) { /* title */ title = (*(this->suggestionsOffset)).getTitle(); url = (*(this->suggestionsOffset)).getPath(); /* increment the cursor for the next call */ this->suggestionsOffset++; return true; } return false; } /* Check if the file has as checksum */ bool Reader::canCheckIntegrity() const { return zimArchive->hasChecksum(); } /* Return true if corrupted, false otherwise */ bool Reader::isCorrupted() const { try { if (zimArchive->check() == true) { return false; } } catch (exception& e) { cerr << e.what() << endl; return true; } return true; } /* Return the file size, works also for splitted files */ unsigned int Reader::getFileSize() const { return kiwix::getArchiveFileSize(*zimArchive); } }