Remove the indexer functionality from kiwix-lib.

This is not used anymore.
2017-04-06 15:35:30 +02:00 · 2017-04-06 15:35:30 +02:00 · e28dbe7c7e
parent 2906202056
commit e28dbe7c7e
6 changed files with 2 additions and 860 deletions
--- a/include/indexer.h
+++ b/include/indexer.h
@ -1,169 +0,0 @@
 /*
 * Copyright 2014 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
 #ifndef KIWIX_INDEXER_H
 #define KIWIX_INDEXER_H
 #include <string>
 #include <vector>
 #include <stack>
 #include <queue>
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <pthread.h>
 #include "common/stringTools.h"
 #include "common/otherTools.h"
 #include <zim/file.h>
 #include <zim/article.h>
 #include <zim/fileiterator.h>
 #include "reader.h"
 using namespace std;
 namespace kiwix {
  /* Work item passed between the indexing stages: the extractor fills in
     the raw article, the parser derives the remaining fields and the
     indexer consumes the result. Every field is carried as text. */
  struct indexerToken {
    std::string url;            /* long URL of the article */
    std::string accentedTitle;  /* title as displayed, accents kept */
    std::string title;          /* title (accent-stripped once parsed) */
    std::string keywords;       /* keywords (accent-stripped) */
    std::string content;        /* raw data, then accent-stripped text dump */
    std::string snippet;        /* short excerpt of the text dump */
    std::string size;           /* content size / 1024, as a decimal string */
    std::string wordCount;      /* word count, as a decimal string */
  };
  /* Abstract base class of the ZIM indexers.
   *
   * start() launches three detached pthreads that form a pipeline:
   *   extractArticles -> toParseQueue -> parseArticles -> toIndexQueue
   *   -> indexArticles
   * The index back-end is supplied by a subclass through the pure virtual
   * indexingPrelude() / index() / flush() / indexingPostlude() hooks, which
   * are called from the indexArticles thread.
   *
   * Each piece of state shared between threads is paired with its own
   * pthread mutex and is read or written through a small accessor.
   */
  class Indexer {
  /* Progress notification invoked from the extractor thread after each
     non-redirect article has been queued. */
  typedef void (* ProgressCallback)(const unsigned int processedArticleCount, const unsigned int totalArticleCount);
  public:
    Indexer();
    virtual ~Indexer();
    /* Start the three worker threads for the given ZIM file / index path. */
    bool start(const string zimPath, const string indexPath, ProgressCallback callback = NULL);
    /* Request cancellation of whichever worker threads are still running. */
    bool stop();
    /* True while at least one of the three worker threads is flagged running. */
    bool isRunning();
    /* Last progression value stored by the workers (0..100). */
    unsigned int getProgression();
    void setVerboseFlag(const bool value);
  protected:
    /* Back-end hooks, implemented by the concrete indexer. */
    /* Called once before the first article is indexed. */
    virtual void indexingPrelude(const string indexPath) = 0;
    /* Called once per parsed article. */
    virtual void index(const string &url, 
 		       const string &title, 
 		       const string &unaccentedTitle,
 		       const string &keywords, 
 		       const string &content,
 		       const string &snippet,
 		       const string &size,
 		       const string &wordCount) = 0;
    /* Called every 5000 indexed articles (see indexArticles). */
    virtual void flush() = 0;
    /* Called once after the last article has been indexed. */
    virtual void indexingPostlude(const string indexPath) = 0;
    /* Others */
    unsigned int countWords(const string &text);
    /* Boost factor */
    /* Weight applied to keywords; set to 3 by the constructor. */
    unsigned int keywordsBoostFactor;
    /* Title weight grows with the content length: 1 + one unit per 500
       characters of content. */
    inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {
      return contentLength / 500 + 1;
    }
    /* Verbose */
    pthread_mutex_t verboseMutex;
    bool getVerboseFlag();
    bool verboseFlag;
  private:
    ProgressCallback progressCallback;
    /* Held by start() and stop() while they create / cancel the threads. */
    pthread_mutex_t threadIdsMutex;
    /* Article extraction */
    pthread_t articleExtractor;
    pthread_mutex_t articleExtractorRunningMutex;
    /* Thread entry point; ptr is the Indexer instance. */
    static void *extractArticles(void *ptr);
    bool articleExtractorRunningFlag;
    bool isArticleExtractorRunning();
    void articleExtractorRunning(bool value);
    /* Article parsing */
    pthread_t articleParser;
    pthread_mutex_t articleParserRunningMutex;
    /* Thread entry point; ptr is the Indexer instance. */
    static void *parseArticles(void *ptr);
    bool articleParserRunningFlag;
    bool isArticleParserRunning();
    void articleParserRunning(bool value);
    /* Index writing */
    pthread_t articleIndexer;
    pthread_mutex_t articleIndexerRunningMutex;
    /* Thread entry point; ptr is the Indexer instance. */
    static void *indexArticles(void *ptr);
    bool articleIndexerRunningFlag;
    bool isArticleIndexerRunning();
    void articleIndexerRunning(bool value);
    /* To parse queue (extractor -> parser) */
    std::queue<indexerToken> toParseQueue;
    pthread_mutex_t toParseQueueMutex;
    void pushToParseQueue(indexerToken &token);
    /* Waits for a token; returns false once the queue is empty and the
       extractor is no longer running. */
    bool popFromToParseQueue(indexerToken &token);
    bool isToParseQueueEmpty();
    /* To index queue (parser -> indexer) */
    std::queue<indexerToken> toIndexQueue;
    pthread_mutex_t toIndexQueueMutex;
    void pushToIndexQueue(indexerToken &token);
    /* Waits for a token; returns false once the queue is empty and the
       parser is no longer running. */
    bool popFromToIndexQueue(indexerToken &token);
    bool isToIndexQueueEmpty();
    /* Article Count & Progression */
    unsigned int articleCount;
    pthread_mutex_t articleCountMutex;
    void setArticleCount(const unsigned int articleCount);
    unsigned int getArticleCount();
    /* Progression */
    unsigned int progression;
    pthread_mutex_t progressionMutex;
    void setProgression(const unsigned int progression);
    /* getProgression() is public */
    /* ZIM path */
    pthread_mutex_t zimPathMutex;
    string zimPath;
    void setZimPath(const string path);
    string getZimPath();
    /* Index path */
    pthread_mutex_t indexPathMutex;
    string indexPath;
    void setIndexPath(const string path);
    string getIndexPath();
    /* ZIM id (written to the "content.id" file once indexing completes) */
    pthread_mutex_t zimIdMutex;
    string zimId;
    void setZimId(const string id);
    string getZimId();
  };
 }
 #endif
--- a/include/meson.build
+++ b/include/meson.build
@ -5,12 +5,8 @@ headers = [
  'searcher.h'
 ]
-if not get_option('android')
-  headers += ['indexer.h']
-endif
 if xapian_dep.found()
-  headers += ['xapianIndexer.h', 'xapianSearcher.h']
+  headers += ['xapianSearcher.h']
 endif
 install_headers(headers, subdir:'kiwix')
--- a/include/xapianIndexer.h
+++ b/include/xapianIndexer.h
@ -1,56 +0,0 @@
 /*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
 #ifndef KIWIX_XAPIAN_INDEXER_H
 #define KIWIX_XAPIAN_INDEXER_H
 #include <xapian.h>
 #include "indexer.h"
 using namespace std;
 namespace kiwix {
  /* Indexer back-end that writes the articles into a Xapian database.
     Implements the four indexing hooks declared by Indexer. */
  class XapianIndexer : public Indexer {
  public:
    XapianIndexer();
  protected:
    /* Opens the temporary writable database (indexPath + ".tmp"). */
    void indexingPrelude(const string indexPath);
    /* Adds one article as a Xapian document. */
    void index(const string &url, 
 	       const string &title, 
 	       const string &unaccentedTitle,
 	       const string &keywords, 
 	       const string &content,
 	       const string &snippet,
 	       const string &size,
 	       const string &wordCount);
    /* Commits the running transaction and opens a new one. */
    void flush();
    /* Commits the last transaction and compacts the temporary database
       into indexPath. */
    void indexingPostlude(const string indexPath);
    /* Database the documents are added to. */
    Xapian::WritableDatabase writableDatabase;
    /* Declared but not configured by the current constructor. */
    Xapian::Stem stemmer;
    /* Filled with the stop words in indexingPrelude(). */
    Xapian::SimpleStopper stopper;
    /* Term generator used to index title, keywords and content. */
    Xapian::TermGenerator indexer;
  };
 }
 #endif
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@ -1,513 +0,0 @@
 /*
 * Copyright 2011-2014 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
 #include "indexer.h"
 #include "xapian/myhtmlparse.h"
 #include "kiwixlib-resources.h"
 namespace kiwix {
  /* Count the words of a text.
   *
   * A word is a maximal run of characters other than the space character
   * (' '); no other whitespace is treated as a separator. An empty or
   * all-space text therefore contains zero words, and consecutive spaces
   * do not produce empty words.
   *
   * The previous implementation started counting at 1 and additionally
   * counted every token, so it always returned one more than the number
   * of tokens (e.g. 1 for an empty text).
   */
  unsigned int Indexer::countWords(const string &text) {
    unsigned int numWords = 0;
    bool insideWord = false;
    const string::size_type length = text.size();
    for (string::size_type i = 0; i < length; ++i) {
      if (text[i] == ' ') {
        insideWord = false;
      } else if (!insideWord) {
        /* First character of a new word */
        insideWord = true;
        ++numWords;
      }
    }
    return numWords;
  }
  /* Constructor
   *
   * Gives every scalar member a defined value and initialises the mutexes.
   * The running flags, the progress callback, the article count and the
   * progression used to be left uninitialised although isRunning() and
   * the extractor thread read them; they now start as "not running",
   * "no callback" and zero. The initialiser list follows the declaration
   * order of the members.
   */
  Indexer::Indexer() :
    keywordsBoostFactor(3),
    verboseFlag(false),
    progressCallback(NULL),
    articleExtractorRunningFlag(false),
    articleParserRunningFlag(false),
    articleIndexerRunningFlag(false),
    articleCount(0),
    progression(0) {
    /* Initialize mutex */
    pthread_mutex_init(&threadIdsMutex, NULL);
    pthread_mutex_init(&toParseQueueMutex, NULL);
    pthread_mutex_init(&toIndexQueueMutex, NULL);
    pthread_mutex_init(&articleExtractorRunningMutex, NULL);
    pthread_mutex_init(&articleParserRunningMutex, NULL);
    pthread_mutex_init(&articleIndexerRunningMutex, NULL);
    pthread_mutex_init(&articleCountMutex, NULL);
    pthread_mutex_init(&zimPathMutex, NULL);
    pthread_mutex_init(&zimIdMutex, NULL);
    pthread_mutex_init(&indexPathMutex, NULL);
    pthread_mutex_init(&progressionMutex, NULL);
    pthread_mutex_init(&verboseMutex, NULL);
  }
  /* Destructor
   *
   * Releases the mutexes initialised by the constructor. The worker
   * threads are detached and keep a pointer to this object, so the caller
   * must make sure they are no longer running before destroying it.
   */
  Indexer::~Indexer() {
    pthread_mutex_destroy(&threadIdsMutex);
    pthread_mutex_destroy(&toParseQueueMutex);
    pthread_mutex_destroy(&toIndexQueueMutex);
    pthread_mutex_destroy(&articleExtractorRunningMutex);
    pthread_mutex_destroy(&articleParserRunningMutex);
    pthread_mutex_destroy(&articleIndexerRunningMutex);
    pthread_mutex_destroy(&articleCountMutex);
    pthread_mutex_destroy(&zimPathMutex);
    pthread_mutex_destroy(&zimIdMutex);
    pthread_mutex_destroy(&indexPathMutex);
    pthread_mutex_destroy(&progressionMutex);
    pthread_mutex_destroy(&verboseMutex);
  }
 #pragma mark - Extractor
  /* Article extractor methods */
  void *Indexer::extractArticles(void *ptr) {
    pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
    kiwix::Indexer *self = (kiwix::Indexer *)ptr;
    /* Get the number of article to index and the ZIM id */
    kiwix::Reader reader(self->getZimPath());
    unsigned int articleCount = reader.getArticleCount();
    self->setArticleCount(articleCount);
    string zimId = reader.getId();
    self->setZimId(zimId);
    /* Progression */
    unsigned int readArticleCount = 0;
    unsigned int currentProgression = 0;
    self->setProgression(currentProgression);
    unsigned int newProgress;
    /* StopWords */
 //    self->readStopWords(reader.getLanguage());
    /* Goes trough all articles */
    zim::File *zimHandler = reader.getZimFileHandler();
    unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
    unsigned int lastOffset = zimHandler->getNamespaceEndOffset('A');
    zim::Article currentArticle;
    while (currentOffset < lastOffset) {
      currentArticle = zimHandler->getArticle(currentOffset);
      if (!currentArticle.isRedirect()) {
        /* Add articles to the queue */
        indexerToken token;
        token.title = currentArticle.getTitle();
        token.url = currentArticle.getLongUrl();
        token.content = string(currentArticle.getData().data(), currentArticle.getData().size());
        self->pushToParseQueue(token);
        readArticleCount += 1;
        /* Update progress */
        if (self->progressCallback) {
            self->progressCallback(readArticleCount, articleCount);
        }
        newProgress = (unsigned int)((float)readArticleCount / (float)articleCount * 100);
        if (newProgress != currentProgression) {
          self->setProgression(newProgress);
        }
      }
      currentOffset += 1;
      /* Test if the thread should be cancelled */
      pthread_testcancel();
    } 
    self->articleExtractorRunning(false);
    pthread_exit(NULL);
    return NULL;
  }
  void Indexer::articleExtractorRunning(bool value) {
    pthread_mutex_lock(&articleExtractorRunningMutex);
    this->articleExtractorRunningFlag = value;
    pthread_mutex_unlock(&articleExtractorRunningMutex); 
  }
  bool Indexer::isArticleExtractorRunning() {
    pthread_mutex_lock(&articleExtractorRunningMutex);
    bool retVal = this->articleExtractorRunningFlag;
    pthread_mutex_unlock(&articleExtractorRunningMutex); 
    return retVal;
  }
 #pragma mark - Parser
  /* Article parser methods */
  /* Thread entry point of the parsing stage.
   *
   * ptr is the Indexer instance. Takes tokens from the to-parse queue,
   * runs the HTML parser on their content and, unless the text dump
   * contains "NOINDEX", completes the token (titles, word count, snippet,
   * size, accent-stripped texts) and pushes it on the to-index queue.
   * Clears the "parser running" flag before exiting. Always returns NULL.
   */
  void *Indexer::parseArticles(void *ptr) {
    pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
    kiwix::Indexer *self = (kiwix::Indexer *)ptr;
    indexerToken token;

    while (self->popFromToParseQueue(token)) {
      MyHtmlParser htmlParser;

      /* The parser throws a lot; whatever it collected so far is used */
      try {
        htmlParser.parse_html(token.content, "UTF-8", true);
      } catch (...) {
      }

      /* Only articles without the noindex marker go any further (the
         parser seems to throw when it meets that meta tag) */
      const bool indexable = (htmlParser.dump.find("NOINDEX") == string::npos);
      if (indexable) {

        /* Accented title: the parsed one, else the one from the ZIM entry */
        if (htmlParser.title.empty()) {
          token.accentedTitle = token.title;
        } else {
          token.accentedTitle = htmlParser.title;
        }

        /* Word count, stored as text */
        stringstream wordCountStream;
        wordCountStream << self->countWords(htmlParser.dump);
        token.wordCount = wordCountStream.str();

        /* Snippet: first 300 characters of the dump, cut back to the
           last '.' or, failing that, to the last space */
        std::string excerpt(htmlParser.dump, 0, 300);
        std::string::size_type cut = excerpt.find_last_of('.');
        if (cut == std::string::npos) {
          cut = excerpt.find_last_of(' ');
        }
        if (cut != std::string::npos) {
          excerpt = excerpt.substr(0, cut);
        }
        token.snippet = excerpt;

        /* Size of the raw content divided by 1024, stored as text;
           computed before the content is replaced below */
        stringstream sizeStream;
        sizeStream << token.content.size() / 1024;
        token.size = sizeStream.str();

        /* Accent-stripped variants used for matching */
        token.title = kiwix::removeAccents(token.accentedTitle);
        token.keywords = kiwix::removeAccents(htmlParser.keywords);
        token.content = kiwix::removeAccents(htmlParser.dump);

        self->pushToIndexQueue(token);
      }

      /* Test if the thread should be cancelled */
      pthread_testcancel();
    }

    self->articleParserRunning(false);
    pthread_exit(NULL);
    return NULL;
  }
  void Indexer::articleParserRunning(bool value) {
    pthread_mutex_lock(&articleParserRunningMutex);
    this->articleParserRunningFlag = value;
    pthread_mutex_unlock(&articleParserRunningMutex); 
  }
  bool Indexer::isArticleParserRunning() {
    pthread_mutex_lock(&articleParserRunningMutex);
    bool retVal = this->articleParserRunningFlag;
    pthread_mutex_unlock(&articleParserRunningMutex); 
    return retVal;
  }
 #pragma mark - Indexer
  /* Article indexer methods */
  /* Thread entry point of the indexing stage.
   *
   * ptr is the Indexer instance. Calls the back-end prelude, feeds every
   * token of the to-index queue to index(), asks for a flush every 5000
   * articles, then calls the postlude, writes the ZIM id into the
   * "content.id" file of the index directory and sets the progression to
   * 100. Clears the "indexer running" flag before exiting. Always returns
   * NULL.
   */
  void *Indexer::indexArticles(void *ptr) {
    pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
    kiwix::Indexer *self = (kiwix::Indexer *)ptr;

    /* Number of articles between two hard-disk flushes */
    const unsigned int flushInterval = 5000;
    unsigned int indexedCount = 0;
    indexerToken token;

    self->indexingPrelude(self->getIndexPath());

    while (self->popFromToIndexQueue(token)) {
      self->index(token.url,
                  token.accentedTitle,
                  token.title,
                  token.keywords,
                  token.content,
                  token.snippet,
                  token.size,
                  token.wordCount);
      ++indexedCount;

      if (indexedCount % flushInterval == 0) {
        self->flush();
      }

      /* Test if the thread should be cancelled */
      pthread_testcancel();
    }

    self->indexingPostlude(self->getIndexPath());

    /* Write content id file */
    const string idFilePath = appendToDirectory(self->getIndexPath(), "content.id");
    writeTextFile(idFilePath, self->getZimId());

    self->setProgression(100);
    kiwix::sleep(100);

    self->articleIndexerRunning(false);
    pthread_exit(NULL);
    return NULL;
  }
  void Indexer::articleIndexerRunning(bool value) {
    pthread_mutex_lock(&articleIndexerRunningMutex);
    this->articleIndexerRunningFlag = value;
    pthread_mutex_unlock(&articleIndexerRunningMutex); 
  }
  bool Indexer::isArticleIndexerRunning() {
    pthread_mutex_lock(&articleIndexerRunningMutex);
    bool retVal = this->articleIndexerRunningFlag;
    pthread_mutex_unlock(&articleIndexerRunningMutex); 
    return retVal;
  }
 #pragma mark - Parse Queue
  /* ToParseQueue methods */
  bool Indexer::isToParseQueueEmpty() {
    pthread_mutex_lock(&toParseQueueMutex);
    bool retVal = this->toParseQueue.empty();
    pthread_mutex_unlock(&toParseQueueMutex);
    return retVal;
  }
  /* Append a copy of token to the to-parse queue, then throttle the
   * producer: the caller sleeps 1000 units for every full 2000 tokens
   * waiting in the queue.
   *
   * The queue length is sampled while the mutex is still held; reading it
   * after the unlock, as was done before, raced with the consumer thread
   * popping from the same queue.
   */
  void Indexer::pushToParseQueue(indexerToken &token) {
    pthread_mutex_lock(&toParseQueueMutex);
    this->toParseQueue.push(token);
    const std::queue<indexerToken>::size_type queued = this->toParseQueue.size();
    pthread_mutex_unlock(&toParseQueueMutex);
    kiwix::sleep(int(queued / 200) / 10 * 1000);
  }
  /* Take the oldest token of the to-parse queue.
   *
   * Waits, polling every 500 sleep units, while the queue is empty and the
   * extractor is still running. Returns true and fills token when a token
   * was taken; returns false once the queue is empty and the extractor
   * has stopped.
   *
   * The emptiness test and the pop are done under a single lock
   * acquisition, so the queue cannot change between the check and
   * front()/pop().
   */
  bool Indexer::popFromToParseQueue(indexerToken &token) {
    while (this->isToParseQueueEmpty() && this->isArticleExtractorRunning()) {
      kiwix::sleep(500);
      if (this->getVerboseFlag()) {
        std::cout << "Waiting... ToParseQueue is empty for now..." << std::endl;
      }
      pthread_testcancel();
    }

    bool popped = false;
    pthread_mutex_lock(&toParseQueueMutex);
    if (!this->toParseQueue.empty()) {
      token = this->toParseQueue.front();
      this->toParseQueue.pop();
      popped = true;
    }
    pthread_mutex_unlock(&toParseQueueMutex);
    return popped;
  }
 #pragma mark - Index Queue
  /* ToIndexQueue methods */
  bool Indexer::isToIndexQueueEmpty() {
    pthread_mutex_lock(&toIndexQueueMutex);
    bool retVal = this->toIndexQueue.empty();
    pthread_mutex_unlock(&toIndexQueueMutex);
    return retVal;
  }
  /* Append a copy of token to the to-index queue, then throttle the
   * producer: the caller sleeps 1000 units for every full 2000 tokens
   * waiting in the queue.
   *
   * The queue length is sampled while the mutex is still held; reading it
   * after the unlock, as was done before, raced with the consumer thread
   * popping from the same queue.
   */
  void Indexer::pushToIndexQueue(indexerToken &token) {
    pthread_mutex_lock(&toIndexQueueMutex);
    this->toIndexQueue.push(token);
    const std::queue<indexerToken>::size_type queued = this->toIndexQueue.size();
    pthread_mutex_unlock(&toIndexQueueMutex);
    kiwix::sleep(int(queued / 200) / 10 * 1000);
  }
  /* Take the oldest token of the to-index queue.
   *
   * Waits, polling every 500 sleep units, while the queue is empty and the
   * parser is still running. Returns true and fills token when a token
   * was taken; returns false once the queue is empty and the parser has
   * stopped.
   *
   * The emptiness test and the pop are done under a single lock
   * acquisition, so the queue cannot change between the check and
   * front()/pop().
   */
  bool Indexer::popFromToIndexQueue(indexerToken &token) {
    while (this->isToIndexQueueEmpty() && this->isArticleParserRunning()) {
      kiwix::sleep(500);
      if (this->getVerboseFlag()) {
        std::cout << "Waiting... ToIndexQueue is empty for now..." << std::endl;
      }
      pthread_testcancel();
    }

    bool popped = false;
    pthread_mutex_lock(&toIndexQueueMutex);
    if (!this->toIndexQueue.empty()) {
      token = this->toIndexQueue.front();
      this->toIndexQueue.pop();
      popped = true;
    }
    pthread_mutex_unlock(&toIndexQueueMutex);
    return popped;
  }
 #pragma mark - Properties Getter & Setter
  /* ZIM & Index methods */
  void Indexer::setZimPath(const string path) {
    pthread_mutex_lock(&zimPathMutex);
    this->zimPath = path;
    pthread_mutex_unlock(&zimPathMutex); 
  }
  string Indexer::getZimPath() {
    pthread_mutex_lock(&zimPathMutex); 
    string retVal = this->zimPath;
    pthread_mutex_unlock(&zimPathMutex);
    return retVal;
  }
  void Indexer::setIndexPath(const string path) {
    pthread_mutex_lock(&indexPathMutex); 
    this->indexPath = path;
    pthread_mutex_unlock(&indexPathMutex); 
  }
  string Indexer::getIndexPath() {
    pthread_mutex_lock(&indexPathMutex); 
    string retVal = this->indexPath;
    pthread_mutex_unlock(&indexPathMutex);
    return retVal;
  }
  /* Store the number of articles of the ZIM file, under articleCountMutex.
     The parameter shadows the member, hence the explicit this->. */
  void Indexer::setArticleCount(const unsigned int articleCount) {
    pthread_mutex_lock(&articleCountMutex);
    (*this).articleCount = articleCount;
    pthread_mutex_unlock(&articleCountMutex);
  }
  unsigned int Indexer::getArticleCount() {
    pthread_mutex_lock(&articleCountMutex);
    unsigned int retVal = this->articleCount;
    pthread_mutex_unlock(&articleCountMutex);
    return retVal;
  }
  /* Store the progression value, under progressionMutex.
     The parameter shadows the member, hence the explicit this->. */
  void Indexer::setProgression(const unsigned int progression) {
    pthread_mutex_lock(&progressionMutex);
    (*this).progression = progression;
    pthread_mutex_unlock(&progressionMutex);
  }
  unsigned int Indexer::getProgression() {
    pthread_mutex_lock(&progressionMutex); 
    unsigned int retVal = this->progression;
    pthread_mutex_unlock(&progressionMutex); 
    return retVal;
  }
  void Indexer::setZimId(const string id) {
    pthread_mutex_lock(&zimIdMutex);
    this->zimId = id;
    pthread_mutex_unlock(&zimIdMutex); 
  }
  string Indexer::getZimId() {
    pthread_mutex_lock(&zimIdMutex); 
    string retVal = this->zimId;
    pthread_mutex_unlock(&zimIdMutex);
    return retVal;
  }
 #pragma mark - Status Management
  /* Manage */
  /* Start indexing the ZIM file zimPath into the index indexPath.
   *
   * Resets the article count and the progression, then creates the
   * extractor, parser and indexer threads (all detached). The parser and
   * indexer are only created once the extractor has published a non-zero
   * article count or has already finished.
   *
   * callback, when not NULL, is invoked by the extractor thread after each
   * queued article. It is stored unconditionally: the member used to be
   * left untouched (possibly uninitialised) when no callback was passed,
   * although the extractor thread reads it.
   *
   * Returns true when the three threads were created. If a thread cannot
   * be created, its running flag is cleared again, the threads already
   * started for this run are cancelled and false is returned; previously
   * the result of pthread_create() was ignored, which left the flag set
   * forever and detached an invalid thread id.
   */
  bool Indexer::start(const string zimPath, const string indexPath, ProgressCallback callback) {
    if (this->getVerboseFlag()) {
      std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
    }

    this->progressCallback = callback;

    this->setArticleCount(0);
    this->setProgression(0);
    this->setZimPath(zimPath);
    this->setIndexPath(indexPath);

    pthread_mutex_lock(&threadIdsMutex);

    /* Extractor */
    this->articleExtractorRunning(true);
    if (pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this) != 0) {
      this->articleExtractorRunning(false);
      pthread_mutex_unlock(&threadIdsMutex);
      return false;
    }
    pthread_detach(this->articleExtractor);

    /* Wait until the extractor knows how many articles there are */
    while(this->isArticleExtractorRunning() && this->getArticleCount() == 0) {
      kiwix::sleep(100);
    }

    /* Parser */
    this->articleParserRunning(true);
    if (pthread_create(&(this->articleParser), NULL, Indexer::parseArticles, (void*)this) != 0) {
      this->articleParserRunning(false);
      if (this->isArticleExtractorRunning()) {
        pthread_cancel(this->articleExtractor);
        this->articleExtractorRunning(false);
      }
      pthread_mutex_unlock(&threadIdsMutex);
      return false;
    }
    pthread_detach(this->articleParser);

    /* Indexer */
    this->articleIndexerRunning(true);
    if (pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this) != 0) {
      this->articleIndexerRunning(false);
      if (this->isArticleParserRunning()) {
        pthread_cancel(this->articleParser);
        this->articleParserRunning(false);
      }
      if (this->isArticleExtractorRunning()) {
        pthread_cancel(this->articleExtractor);
        this->articleExtractorRunning(false);
      }
      pthread_mutex_unlock(&threadIdsMutex);
      return false;
    }
    pthread_detach(this->articleIndexer);

    pthread_mutex_unlock(&threadIdsMutex);
    return true;
  }
  /* Tell whether at least one of the three worker threads is flagged as
     running. In verbose mode the state of each flag is printed first. */
  bool Indexer::isRunning() {
    if (this->getVerboseFlag()) {
      const char *extractorState = this->isArticleExtractorRunning() ? "yes" : "no";
      std::cout << "isArticleExtractor running: " << extractorState << std::endl;
      const char *parserState = this->isArticleParserRunning() ? "yes" : "no";
      std::cout << "isArticleParser running: " << parserState << std::endl;
      const char *indexerState = this->isArticleIndexerRunning() ? "yes" : "no";
      std::cout << "isArticleIndexer running: " << indexerState << std::endl;
    }

    if (this->isArticleExtractorRunning()) {
      return true;
    }
    if (this->isArticleIndexerRunning()) {
      return true;
    }
    return this->isArticleParserRunning();
  }
  /* Ask the worker threads that are still flagged as running to stop.
     Each of them is cancelled (indexer first, then parser, then extractor)
     and its running flag is cleared. Always returns true. */
  bool Indexer::stop() {
    if (!this->isRunning()) {
      return true;
    }

    /* Snapshot the flags before taking the thread-ids lock */
    const bool extractorAlive = this->isArticleExtractorRunning();
    const bool indexerAlive = this->isArticleIndexerRunning();
    const bool parserAlive = this->isArticleParserRunning();

    pthread_mutex_lock(&threadIdsMutex);

    if (indexerAlive) {
      pthread_cancel(this->articleIndexer);
      this->articleIndexerRunning(false);
    }

    if (parserAlive) {
      pthread_cancel(this->articleParser);
      this->articleParserRunning(false);
    }

    if (extractorAlive) {
      pthread_cancel(this->articleExtractor);
      this->articleExtractorRunning(false);
    }

    pthread_mutex_unlock(&threadIdsMutex);
    return true;
  }
 #pragma mark - verbose
  /* Manage the verboseFlag */
  void Indexer::setVerboseFlag(const bool value) {
    pthread_mutex_lock(&verboseMutex);
    this->verboseFlag = value;
    pthread_mutex_unlock(&verboseMutex);
  }
  bool Indexer::getVerboseFlag() {
    bool value;
    pthread_mutex_lock(&verboseMutex);
    value = this->verboseFlag;
    pthread_mutex_unlock(&verboseMutex);
    return value;
  }
 }
--- a/src/meson.build
+++ b/src/meson.build
@ -16,14 +16,9 @@ kiwix_sources += lib_resources
 if xapian_dep.found()
  kiwix_sources += ['xapianSearcher.cpp']
-  if not get_option('android')
-    kiwix_sources += ['xapianIndexer.cpp']
-  endif
 endif
-if not get_option('android')
+if get_option('android')
-  kiwix_sources += ['indexer.cpp']
-else
  subdir('android')
 endif
--- a/src/xapianIndexer.cpp
+++ b/src/xapianIndexer.cpp
@ -1,111 +0,0 @@
 /*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
 #include "xapianIndexer.h"
 namespace kiwix {
  /* Constructor: nothing to set up beyond the default-constructed members. */
  XapianIndexer::XapianIndexer() {
    /* Disabled stemming setup, kept for reference:
    stemmer(Xapian::Stem("french")) {
    this->indexer.set_stemmer(this->stemmer);
    */ 
  }
  void XapianIndexer::indexingPrelude(const string indexPath) {
    this->writableDatabase = Xapian::WritableDatabase(indexPath+".tmp", Xapian::DB_CREATE_OR_OVERWRITE | Xapian::DB_BACKEND_GLASS);
    this->writableDatabase.begin_transaction(true);
    /* Insert the stopwords */
    if (!this->stopWords.empty()) {
      std::vector<std::string>::iterator it = this->stopWords.begin();
      for( ; it != this->stopWords.end(); ++it) {
 	this->stopper.add(*it);
      }
      this->indexer.set_stopper(&(this->stopper));
    }
  }
  void XapianIndexer::index(const string &url, 
 			    const string &title, 
 			    const string &unaccentedTitle,
 			    const string &keywords, 
 			    const string &content,
 			    const string &snippet,
 			    const string &size,
 			    const string &wordCount) {
    /* Put the data in the document */
    Xapian::Document currentDocument; 
    currentDocument.clear_values();
    currentDocument.add_value(0, title);
    currentDocument.add_value(1, snippet);
    currentDocument.add_value(2, size);
    currentDocument.add_value(3, wordCount);
    currentDocument.set_data(url);
    indexer.set_document(currentDocument);
    /* Index the title */
    if (!unaccentedTitle.empty()) {
      this->indexer.index_text_without_positions(unaccentedTitle, this->getTitleBoostFactor(content.size()));
    }
    /* Index the keywords */
    if (!keywords.empty()) {
      this->indexer.index_text_without_positions(keywords, keywordsBoostFactor);
    }
    /* Index the content */
    if (!content.empty()) {
      this->indexer.index_text_without_positions(content);
    }
    /* add to the database */
    this->writableDatabase.add_document(currentDocument);
  }
  void XapianIndexer::flush() {
    this->writableDatabase.commit_transaction();
    this->writableDatabase.begin_transaction(true);
  }
  void XapianIndexer::indexingPostlude(const string indexPath) {
    this->flush();
    this->writableDatabase.commit_transaction();
 #ifdef _WIN32
    this->writableDatabase.close();
 #endif
    /* Compacting the index */
    Xapian::Compactor compactor;
    try {
      Xapian::Database src;
      src.add_database(Xapian::Database(indexPath+".tmp"));
      src.compact(indexPath, Xapian::Compactor::FULL | Xapian::DBCOMPACT_SINGLE_FILE, 0, compactor);
    } catch (const Xapian::Error &error) {
      cerr << indexPath << ": " << error.get_description() << endl;
      exit(1);
    } catch (const char * msg) {
      cerr << indexPath << ": " << msg << endl;
      exit(1);
    }
  }
 }