libkiwix/src/reader.cpp

/*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */

#include "reader.h"
#include <time.h>

#include <zim/search.h>
#include <zim/item.h>
#include <zim/error.h>

#include "tools/otherTools.h"

/* Return the lowercase hexadecimal digit for the high nibble (bits 4-7)
 * of v. */
inline char hi(char v)
{
  static const char digits[] = "0123456789abcdef";
  return digits[(static_cast<unsigned char>(v) >> 4) & 0xf];
}

/* Return the lowercase hexadecimal digit for the low nibble (bits 0-3)
 * of v. */
inline char lo(char v)
{
  static const char digits[] = "0123456789abcdef";
  return digits[static_cast<unsigned char>(v) & 0xf];
}

/* Format the first 16 bytes of `in` as a lowercase hexadecimal UUID string
 * laid out as 8-4-4-4-12 digits (4, 2, 2, 2 and 6 bytes, separated by '-').
 *
 * Bytes beyond the 16th are ignored. An input shorter than 16 bytes is
 * padded with zero bytes, so the function never reads past the end of the
 * string (the previous implementation indexed in[0..15] unconditionally). */
std::string hexUUID(std::string in)
{
  const std::string::size_type uuidSize = 16;

  /* `in` is a by-value copy: padding it here does not affect the caller. */
  if (in.size() < uuidSize) {
    in.resize(uuidSize, '\0');
  }

  std::string out;
  out.reserve(2 * uuidSize + 4);
  for (std::string::size_type n = 0; n < uuidSize; ++n) {
    /* A dash precedes the 5th, 7th, 9th and 11th byte */
    if (n == 4 || n == 6 || n == 8 || n == 10) {
      out += '-';
    }
    out += hi(in[n]);
    out += lo(in[n]);
  }
  return out;
}

namespace kiwix
{
/* Constructor: open the ZIM archive found at zimFilePath.
 *
 * The path is remembered unchanged in the zimFilePath member. The archive
 * itself is opened from a copy of the path in which a trailing "zimaa" has
 * been shortened to "zim". */
Reader::Reader(const string zimFilePath)
  :  zimArchive(nullptr),
     zimFilePath(zimFilePath)
{
  const char splitSuffix[] = "zimaa";
  const string::size_type suffixLength = sizeof(splitSuffix) - 1;

  string archivePath = zimFilePath;

  /* Only a path strictly longer than the suffix and ending with it is
   * rewritten; dropping the last two characters turns "zimaa" into "zim" */
  const bool endsWithSplitSuffix =
      archivePath.size() > suffixLength
      && archivePath.compare(archivePath.size() - suffixLength,
                             suffixLength,
                             splitSuffix) == 0;
  if (endsWithSplitSuffix) {
    archivePath.erase(archivePath.size() - 2);
  }

  zimArchive.reset(new zim::Archive(archivePath));

  /* Seed rand(), which getRandomPage() relies on */
  srand(time(nullptr));
}

/* Expose the underlying zim::Archive as a raw pointer taken from the
 * zimArchive member (no ownership is transferred). */
zim::Archive* Reader::getZimArchive() const
{
  zim::Archive* const archive = zimArchive.get();
  return archive;
}

/* Read the "Counter" metadata of the archive and parse it into a
 * mimetype -> count map.
 *
 * An archive without a "Counter" metadata entry yields an empty map instead
 * of letting zim::EntryNotFound escape, so getArticleCount() and
 * getMediaCount() simply report 0 for such archives. */
MimeCounterType Reader::parseCounterMetadata() const
{
  try {
    auto counterContent = zimArchive->getMetadata("Counter");
    return parseMimetypeCounter(counterContent);
  } catch (const zim::EntryNotFound&) {
    return MimeCounterType();
  }
}

/* Number of articles which can be indexed/displayed: the sum of the
 * "Counter" values of every mimetype starting with "text/html". */
unsigned int Reader::getArticleCount() const
{
  const std::map<const std::string, unsigned int> mimeCounts
      = this->parseCounterMetadata();

  unsigned int htmlTotal = 0;
  for (auto it = mimeCounts.begin(); it != mimeCounts.end(); ++it) {
    if (!startsWith(it->first, "text/html")) {
      continue;
    }
    htmlTotal += it->second;
  }

  return htmlTotal;
}

/* Number of media items in the ZIM file: the sum of the "Counter" values
 * for the image/jpeg, image/gif and image/png mimetypes (exact matches
 * only). */
unsigned int Reader::getMediaCount() const
{
  static const char* const imageMimeTypes[] = {
    "image/jpeg",
    "image/gif",
    "image/png"
  };

  const std::map<const std::string, unsigned int> mimeCounts
      = this->parseCounterMetadata();

  unsigned int mediaTotal = 0;
  for (const char* mimeType : imageMimeTypes) {
    const auto found = mimeCounts.find(mimeType);
    if (found != mimeCounts.end()) {
      mediaTotal += found->second;
    }
  }
  return mediaTotal;
}

/* Total number of entries of the ZIM file (redirects included), as
 * reported by the archive's getEntryCount(). */
unsigned int Reader::getGlobalCount() const
{
  const unsigned int entryCount = zimArchive->getEntryCount();
  return entryCount;
}

/* Return the UID of the ZIM file: the archive UUID rendered through its
 * stream insertion operator. */
string Reader::getId() const
{
  const auto uuid = zimArchive->getUuid();

  std::ostringstream idStream;
  idStream << uuid;
  return idStream.str();
}

/* Pick a random "text/html" entry which is not the main page.
 *
 * Entries are drawn by random index using rand(). At most 41 draws are made;
 * if none of them is suitable, NoEntry is thrown. */
Entry Reader::getRandomPage() const
{
  const auto mainPagePath = zimArchive->getMainEntry().getPath();

  for (int attemptsLeft = 41; attemptsLeft > 0; --attemptsLeft) {
    /* Scale rand() from [0, RAND_MAX] to [0, entry count) */
    const auto idx = static_cast<zim::size_type>(
        static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1)
        * zimArchive->getEntryCount());
    auto entry = zimArchive->getEntryByPath(idx);

    /* The mimetype is taken from the item reached with getItem(true) and is
     * only looked at when the entry is not the main page */
    if (entry.getPath() != mainPagePath
        && entry.getItem(true).getMimetype() == "text/html") {
      return entry;
    }
  }

  throw NoEntry();
}

/* Return the main entry of the archive, as given by getMainEntry(). */
Entry Reader::getMainPage() const
{
  auto mainEntry = zimArchive->getMainEntry();
  return mainEntry;
}

/* Fetch the favicon of the archive.
 *
 * On success `content` receives the item data, `mimeType` its mimetype, and
 * true is returned. If zim::EntryNotFound is raised along the way, false is
 * returned. */
bool Reader::getFavicon(string& content, string& mimeType) const
{
  try {
    auto faviconEntry = zimArchive->getFaviconEntry();
    auto faviconItem = faviconEntry.getItem(true);
    content = faviconItem.getData();
    mimeType = faviconItem.getMimetype();
  } catch (const zim::EntryNotFound&) {
    return false;
  }

  return true;
}

/* Return the path given to the constructor (stored unchanged). */
string Reader::getZimFilePath() const
{
  return this->zimFilePath;
}
/* Return a metatag value */
bool Reader::getMetadata(const string& name, string& value) const
{
  try {
    value = zimArchive->getMetadata(name);
    return true;
  } catch(zim::EntryNotFound& e) {
    return false;
  }
}

#define METADATA(NAME) std::string v; getMetadata(NAME, v); return v;

/* Return the "Name" metadata, or an empty string when it is missing. */
string Reader::getName() const
{
  std::string name;
  this->getMetadata("Name", name);
  return name;
}

/* Return the title of the ZIM file.
 *
 * The "Title" metadata is used when present and non-empty. Otherwise the
 * title is derived from the file name: last path element, underscores
 * replaced by spaces, cut at the first ".zim".
 *
 * The metadata is read through Reader::getMetadata(), which does not throw
 * when the entry is missing; a direct archive lookup would raise
 * zim::EntryNotFound and the file-name fallback would never be reached. */
string Reader::getTitle() const
{
  string value;
  this->getMetadata("Title", value);
  if (value.empty()) {
    value = getLastPathElement(zimFilePath);
    std::replace(value.begin(), value.end(), '_', ' ');
    /* substr(0, npos) keeps the whole string when ".zim" is absent */
    size_t pos = value.find(".zim");
    value = value.substr(0, pos);
  }
  return value;
}

/* Return the "Creator" metadata, or an empty string when it is missing. */
string Reader::getCreator() const
{
  std::string creator;
  this->getMetadata("Creator", creator);
  return creator;
}

/* Return the "Publisher" metadata, or an empty string when it is missing. */
string Reader::getPublisher() const
{
  std::string publisher;
  this->getMetadata("Publisher", publisher);
  return publisher;
}

/* Return the "Date" metadata, or an empty string when it is missing. */
string Reader::getDate() const
{
  std::string date;
  this->getMetadata("Date", date);
  return date;
}

/* Return the "Description" metadata, falling back on "Subtitle" (the name
 * Mediawiki Collection tends to use) when the description is missing or
 * empty. The result is empty when neither is available. */
string Reader::getDescription() const
{
  string description;
  this->getMetadata("Description", description);
  if (!description.empty()) {
    return description;
  }

  this->getMetadata("Subtitle", description);
  return description;
}

/* Return the "LongDescription" metadata, or an empty string when it is
 * missing. */
string Reader::getLongDescription() const
{
  std::string longDescription;
  this->getMetadata("LongDescription", longDescription);
  return longDescription;
}

/* Return the "Language" metadata, or an empty string when it is missing. */
string Reader::getLanguage() const
{
  std::string language;
  this->getMetadata("Language", language);
  return language;
}

/* Return the "License" metadata, or an empty string when it is missing. */
string Reader::getLicense() const
{
  std::string license;
  this->getMetadata("License", license);
  return license;
}

/* Return the "Tags" metadata.
 *
 * With `original` set, the raw metadata string is returned as stored.
 * Otherwise the string goes through convertTags() and the resulting tags
 * are joined with ";". */
string Reader::getTags(bool original) const
{
  string rawTags;
  getMetadata("Tags", rawTags);

  if (!original) {
    auto convertedTags = convertTags(rawTags);
    return join(convertedTags, ";");
  }
  return rawTags;
}


/* Look up `tagName` in the converted "Tags" metadata and return its value,
 * as computed by getTagValueFromTagList(). */
string Reader::getTagStr(const std::string& tagName) const
{
  string rawTags;
  this->getMetadata("Tags", rawTags);
  return getTagValueFromTagList(convertTags(rawTags), tagName);
}

bool Reader::getTagBool(const std::string& tagName) const
{
  return convertStrToBool(getTagStr(tagName));
}

/* Return the "Relation" metadata, or an empty string when it is missing. */
string Reader::getRelation() const
{
  std::string relation;
  this->getMetadata("Relation", relation);
  return relation;
}

/* Return the "Flavour" metadata, or an empty string when it is missing. */
string Reader::getFlavour() const
{
  std::string flavour;
  this->getMetadata("Flavour", flavour);
  return flavour;
}

/* Return the "Source" metadata, or an empty string when it is missing. */
string Reader::getSource() const
{
  std::string source;
  this->getMetadata("Source", source);
  return source;
}

/* Return the "Scraper" metadata, or an empty string when it is missing. */
string Reader::getScraper() const
{
  std::string scraper;
  this->getMetadata("Scraper", scraper);
  return scraper;
}
#undef METADATA

/* Return the original ID stored in the "startfileuid" metadata, formatted
 * by hexUUID(), or an empty string when that metadata is missing or empty.
 *
 * The metadata is read as a list of numbers, each terminated by '\n'; every
 * number is converted with atoi() and stored as one byte of the ID. Text
 * after the last '\n' is ignored.
 *
 * At most 16 bytes are stored (extra values are ignored rather than written
 * past the end of the buffer), and all 16 bytes are handed to hexUUID() with
 * an explicit length so that a zero byte does not cut the ID short. */
string Reader::getOrigId() const
{
  string value;
  this->getMetadata("startfileuid", value);
  if (value.empty()) {
    return "";
  }

  char rawId[16] = {0};
  const unsigned int rawIdSize = sizeof(rawId);
  unsigned int filled = 0;
  std::string number;

  for (unsigned int i = 0; i < value.size() && filled < rawIdSize; i++) {
    if (value[i] == '\n') {
      rawId[filled] = static_cast<char>(atoi(number.c_str()));
      number.clear();
      filled++;
    } else {
      number += value[i];
    }
  }

  /* Unfilled trailing bytes stay zero */
  return hexUUID(std::string(rawId, rawIdSize));
}

/* Return the entry stored at `path`.
 *
 * An empty path or "/" designates the main page. NoEntry is thrown when no
 * archive is loaded or when the archive reports zim::EntryNotFound for the
 * path. */
Entry Reader::getEntryFromPath(const std::string& path) const
{
  if (!this->zimArchive) {
    throw NoEntry();
  }

  const bool wantsMainPage = path.empty() || path == "/";
  if (wantsMainPage) {
    return getMainPage();
  }

  try {
    return zimArchive->getEntryByPath(path);
  } catch (const zim::EntryNotFound&) {
    throw NoEntry();
  }
}

/* Same as getEntryFromPath(), for a path which is first passed through
 * urlDecode(path, true). */
Entry Reader::getEntryFromEncodedPath(const std::string& path) const
{
  const std::string decodedPath = urlDecode(path, true);
  return getEntryFromPath(decodedPath);
}

/* Return the entry whose title is `title`.
 *
 * NoEntry is thrown when no archive is loaded or when the archive reports
 * zim::EntryNotFound for the title. */
Entry Reader::getEntryFromTitle(const std::string& title) const
{
  if (!this->zimArchive) {
    throw NoEntry();
  }

  try {
    auto entry = zimArchive->getEntryByTitle(title);
    return entry;
  } catch (const zim::EntryNotFound&) {
    throw NoEntry();
  }
}

/* Tell whether the archive has an entry at `path`; false when no archive
 * is loaded. */
bool Reader::pathExists(const string& path) const
{
  return zimArchive && zimArchive->hasEntryByPath(path);
}

/* Does the ZIM file have a fulltext index? Always false when no archive
 * is loaded. */
bool Reader::hasFulltextIndex() const
{
  return zimArchive && zimArchive->hasFulltextIndex();
}

/* Search titles by prefix and store the matches in the reader's own
 * suggestion list, to be walked afterwards with getNextSuggestion().
 *
 * With `reset` set, the stored list is emptied first. Without it, the
 * search is skipped (returning false) when the list already holds more than
 * suggestionsCount suggestions. The cursor is put back at the beginning of
 * the list after the search. Returns what the list-filling overload
 * returns. */
bool Reader::searchSuggestions(const string& prefix,
                               unsigned int suggestionsCount,
                               const bool reset)
{
  if (reset) {
    /* Start from an empty list, with a cursor valid for it */
    this->suggestions.clear();
    this->suggestionsOffset = this->suggestions.begin();
  } else if (this->suggestions.size() > suggestionsCount) {
    return false;
  }

  const bool found
      = searchSuggestions(prefix, suggestionsCount, this->suggestions);

  /* Set the cursor to the beginning */
  this->suggestionsOffset = this->suggestions.begin();

  return found;
}


bool Reader::searchSuggestions(const string& prefix,
                               unsigned int suggestionsCount,
                               SuggestionsList_t& results)
{
  bool retVal = false;

  /* Return if no prefix */
  if (prefix.size() == 0) {
    return false;
  }

  for (auto& entry: zimArchive->findByTitle(prefix)) {
    if (results.size() >= suggestionsCount) {
      break;
    }
    /* Extract the interesting part of article title & url */
    std::string normalizedArticleTitle
        = kiwix::normalize(entry.getTitle());

    // Get the final path.
    auto item = entry.getItem(true);
    std::string articleFinalUrl = item.getPath();

    /* Go through all already found suggestions and skip if this
       article is already in the suggestions list (with an other
       title) */
    bool insert = true;
    std::vector<std::vector<std::string>>::iterator suggestionItr;
    for (suggestionItr = results.begin();
         suggestionItr != results.end();
         suggestionItr++) {
      int result = normalizedArticleTitle.compare((*suggestionItr)[2]);
      if (result == 0 && articleFinalUrl.compare((*suggestionItr)[1]) == 0) {
        insert = false;
        break;
      } else if (result < 0) {
        break;
      }
    }

    /* Insert if possible */
    if (insert) {
      std::vector<std::string> suggestion;
      suggestion.push_back(entry.getTitle());
      suggestion.push_back(articleFinalUrl);
      suggestion.push_back(normalizedArticleTitle);
      results.insert(suggestionItr, suggestion);
    }

    /* Suggestions where found */
    retVal = true;
  }

  return retVal;
}

std::vector<std::string> Reader::getTitleVariants(
    const std::string& title) const
{
  std::vector<std::string> variants;
  variants.push_back(title);
  variants.push_back(kiwix::ucFirst(title));
  variants.push_back(kiwix::lcFirst(title));
  variants.push_back(kiwix::toTitle(title));
  return variants;
}


/* Run the smart suggestion search and store the matches in the reader's
 * own suggestion list, which is emptied first. The cursor used by
 * getNextSuggestion() ends up at the beginning of the list. */
bool Reader::searchSuggestionsSmart(const string& prefix,
                                    unsigned int suggestionsCount)
{
  /* Start from an empty list, with a cursor valid for it */
  this->suggestions.clear();
  this->suggestionsOffset = this->suggestions.begin();

  const bool found
      = searchSuggestionsSmart(prefix, suggestionsCount, this->suggestions);

  /* The list may have changed: put the cursor back at its beginning */
  this->suggestionsOffset = this->suggestions.begin();

  return found;
}

/* Smart suggestion search filling `results`.
 *
 * A zim::Search in suggestion mode is tried first, limited to the range
 * [0, suggestionsCount). When it estimates at least one match, its results
 * are appended to `results` (title, path, normalized title) and true is
 * returned. Otherwise a few variations of the prefix are tried with the
 * prefix search, and true is returned if any of them found something. */
bool Reader::searchSuggestionsSmart(const string& prefix,
                                    unsigned int suggestionsCount,
                                    SuggestionsList_t& results)
{
  const std::vector<std::string> variants = this->getTitleVariants(prefix);

  auto suggestionSearch = zim::Search(*zimArchive);
  suggestionSearch.set_query(prefix);
  suggestionSearch.set_range(0, suggestionsCount);
  suggestionSearch.set_suggestion_mode(true);

  if (!suggestionSearch.get_matches_estimated()) {
    /* Nothing from the search: fall back on the prefix variants. Every
     * variant is searched, whatever the outcome of the previous ones. */
    bool anyVariantMatched = false;
    for (const auto& variant : variants) {
      if (this->searchSuggestions(variant, suggestionsCount, results)) {
        anyVariantMatched = true;
      }
    }
    return anyVariantMatched;
  }

  for (auto current = suggestionSearch.begin();
       current != suggestionSearch.end();
       current++) {
    std::vector<std::string> suggestion;
    suggestion.push_back(current->getTitle());
    suggestion.push_back(current->getPath());
    suggestion.push_back(kiwix::normalize(current->getTitle()));
    results.push_back(suggestion);
  }
  return true;
}

/* Get the next suggestion: store its title in `title` and move the cursor
 * forward. Returns false, leaving `title` untouched, once the end of the
 * suggestion list has been reached. */
bool Reader::getNextSuggestion(string& title)
{
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* element 0 is the title */
  const auto& current = *(this->suggestionsOffset);
  title = current[0];

  /* move the cursor for the next call */
  ++(this->suggestionsOffset);

  return true;
}

/* Get the next suggestion: store its title in `title` and its path in
 * `url`, then move the cursor forward. Returns false, leaving both outputs
 * untouched, once the end of the suggestion list has been reached. */
bool Reader::getNextSuggestion(string& title, string& url)
{
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* element 0 is the title, element 1 the url */
  const auto& current = *(this->suggestionsOffset);
  title = current[0];
  url = current[1];

  /* move the cursor for the next call */
  ++(this->suggestionsOffset);

  return true;
}

/* Check if the file has a checksum, as reported by the archive's
 * hasChecksum(). */
bool Reader::canCheckIntegrity() const
{
  const bool hasChecksum = zimArchive->hasChecksum();
  return hasChecksum;
}

/* Return true if corrupted, false otherwise.
 *
 * The file is considered sound only when the archive's check() returns
 * true. An exception raised by the check is written to cerr and the file is
 * reported as corrupted. */
bool Reader::isCorrupted() const
{
  try {
    const bool checkPassed = (zimArchive->check() == true);
    if (checkPassed) {
      return false;
    }
  } catch (exception& e) {
    cerr << e.what() << endl;
  }

  return true;
}

/* Return the file size, works also for split files.
 *
 * The value is the archive's getFilesize() divided by 1024 (presumably
 * bytes turned into KiB - to be confirmed against libzim); 0 when no
 * archive is loaded. */
unsigned int Reader::getFileSize() const
{
  return zimArchive ? zimArchive->getFilesize() / 1024 : 0;
}

}