+ Refactor the kiwix::Indexer class into kiwix::Indexer (abstract) and kiwix::XapianIndexer

2010-10-27 19:23:05 +00:00 · 2010-10-27 19:23:05 +00:00 · ef2423b1a7
parent d52c86bcac
commit ef2423b1a7
4 changed files with 215 additions and 169 deletions
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@ -23,155 +23,6 @@ namespace kiwix {
    /* Open the ZIM file */
    this->zimFileHandler = new zim::File(zimFilePath);
    /* Open the Xapian directory */
    this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, 
 							  Xapian::DB_CREATE_OR_OVERWRITE);
    /* Stemming */
    /*
    stemmer = Xapian::Stem("french");
    indexer.set_stemmer(stemmer);
    */
    /* Read the stopwords file */
    /*
    this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
    std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
    this->stopper.add("ceci");
    while (stopWordsIterator != this->stopWords.end()) {
      this->stopper.add(*stopWordsIterator);
      stopWordsIterator++;
    }
    indexer.set_stopper(&(this->stopper));
    */
    /* Prepare the indexation */
    this->prepareIndexing();
  }
  /* Destructor: releases the ZIM file handler and the Xapian database */
  Indexer::~Indexer() {
    stopIndexing();
  }
  /* Start indexing */
  void Indexer::prepareIndexing() {
    /* Define a few values */
    this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
    this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
    this->currentArticleOffset = this->firstArticleOffset;
    /* Compute few things */
    this->articleCount = this->zimFileHandler->getNamespaceCount('A');
    this->stepSize = (float)this->articleCount / (float)100;
  }
  /* Index next percent */
  bool Indexer::indexNextPercent(const bool &verbose) {
    float thresholdOffset = this->currentArticleOffset + this->stepSize;
    size_t found;
    /* Check if we can start */
    if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
      return false;
    }
    /* Begin the Xapian transation */
    this->writableDatabase->begin_transaction(true);
    while(this->currentArticleOffset < thresholdOffset && 
 	  this->currentArticleOffset < this->lastArticleOffset) {
      zim::Article currentArticle;
      Xapian::Document currentDocument;
      /* Get next non redirect article */
      do {
 	currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
      } while (this->currentArticleOffset++ &&
 	       currentArticle.isRedirect() && 
 	       this->currentArticleOffset != this->lastArticleOffset);
      if (!currentArticle.isRedirect()) {
 	/* Index the content */
 	this->htmlParser.reset();
 	string content (currentArticle.getData().data(), currentArticle.getData().size());
 	/* The parser generate a lot of exceptions which should be avoided */
 	try {
 	  this->htmlParser.parse_html(content, "UTF-8", true);
 	} catch (...) {
 	}
 	/* If content does not have the noindex meta tag */
 	/* Seems that the parser generates an exception in such case */
 	found = this->htmlParser.dump.find("NOINDEX");
 	if (found == string::npos) {
 	  /* Put the data in the document */
 	  currentDocument.clear_values();
 	  currentDocument.add_value(0, this->htmlParser.title);
 	  currentDocument.set_data(currentArticle.getLongUrl().c_str());
 	  indexer.set_document(currentDocument);
 	  /* Debug output */
 	  if (verbose) {
 	    std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
 	  }
 	  /* Index the title */
 	  if (!this->htmlParser.title.empty()) {
 	    indexer.index_text_without_positions(removeAccents(this->htmlParser.title), 
 						 ((this->htmlParser.dump.size() / 100) + 1) / 
 						 countWords(this->htmlParser.title) );
 	  }
 	  /* Index the keywords */
 	  if (!this->htmlParser.keywords.empty()) {
 	    indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
 	  }
 	  /* Index the content */
 	  if (!this->htmlParser.dump.empty()) {
 	    indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
 	  }
 	  /* add to the database */
 	  this->writableDatabase->add_document(currentDocument);
 	}
      }
    }
    /* Flush and close Xapian transaction*/
    this->writableDatabase->commit_transaction();
    /* increment the offset and set returned value */
    if (this->currentArticleOffset < this->lastArticleOffset) {
      this->currentArticleOffset++;
      return true;
    } else {
      this->stopIndexing();
      return false;
    }
  }
  /* Stop indexing. TODO: using it crashes the software under Windows. Has to be done in indexNextPercent() */
  void Indexer::stopIndexing() {
    /* Release the ZIM file handler (deleting a NULL pointer is a no-op) */
    delete this->zimFileHandler;
    this->zimFileHandler = NULL;

    /* Release the Xapian writable database the same way */
    delete this->writableDatabase;
    this->writableDatabase = NULL;
  }
  /* Read the file containing the stopwords */
--- a/src/common/kiwix/indexer.h
+++ b/src/common/kiwix/indexer.h
@ -21,34 +21,30 @@ namespace kiwix {
  public:
    Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
-    ~Indexer();
+    virtual bool indexNextPercent(const bool &verbose = false) = 0;
    bool indexNextPercent(const bool &verbose = false);
  protected:
-    void prepareIndexing();
+    virtual void prepareIndexing() = 0;
-    void stopIndexing();
+    virtual void stopIndexing() = 0;
    unsigned int countWords(const string &text);
    bool readStopWordsFile(const string path);
    unsigned int articleCount;
    float stepSize;
    /* ZIM file handling */
    zim::File* zimFileHandler;
    zim::size_type firstArticleOffset;
    zim::size_type lastArticleOffset;
    zim::size_type currentArticleOffset;
-    Xapian::WritableDatabase *writableDatabase;
+    /* HTML parsing */
    Xapian::Stem stemmer;
    Xapian::SimpleStopper stopper;
    Xapian::TermGenerator indexer;
    std::vector<std::string> stopWords;
    MyHtmlParser htmlParser;
-  };
+    unsigned int countWords(const string &text);
    /* Stopwords */
    bool readStopWordsFile(const string path);
    std::vector<std::string> stopWords;
    /* Others */
    unsigned int articleCount;
    float stepSize;
  };
 }
 #endif
--- a/src/common/kiwix/xapianIndexer.cpp
+++ b/src/common/kiwix/xapianIndexer.cpp
@ -0,0 +1,158 @@
 #include "xapianIndexer.h"
 namespace kiwix {
  /*
   * Constructor: forwards both paths to the Indexer constructor, opens
   * (creating or overwriting) the Xapian database located at
   * xapianDirectoryPath and prepares the indexing.
   */
  XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
    Indexer(zimFilePath, xapianDirectoryPath),
    writableDatabase(new Xapian::WritableDatabase(xapianDirectoryPath,
                                                  Xapian::DB_CREATE_OR_OVERWRITE)) {
    /* Stemming (currently disabled) */
    /*
    stemmer = Xapian::Stem("french");
    indexer.set_stemmer(stemmer);
    */

    /* Read the stopwords file (currently disabled) */
    /*
    this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
    std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
    this->stopper.add("ceci");
    while (stopWordsIterator != this->stopWords.end()) {
      this->stopper.add(*stopWordsIterator);
      stopWordsIterator++;
    }
    indexer.set_stopper(&(this->stopper));
    */

    /* Prepare the indexing */
    prepareIndexing();
  }
  /* Destructor: releases the ZIM file handler and the Xapian database */
  XapianIndexer::~XapianIndexer() {
    stopIndexing();
  }
  /* Start indexing */
  void XapianIndexer::prepareIndexing() {
    /* Define a few values */
    this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
    this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
    this->currentArticleOffset = this->firstArticleOffset;
    /* Compute few things */
    this->articleCount = this->zimFileHandler->getNamespaceCount('A');
    this->stepSize = (float)this->articleCount / (float)100;
  }
  /* Index next percent */
  bool XapianIndexer::indexNextPercent(const bool &verbose) {
    float thresholdOffset = this->currentArticleOffset + this->stepSize;
    size_t found;
    /* Check if we can start */
    if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
      return false;
    }
    /* Begin the Xapian transation */
    this->writableDatabase->begin_transaction(true);
    while(this->currentArticleOffset < thresholdOffset && 
 	  this->currentArticleOffset < this->lastArticleOffset) {
      zim::Article currentArticle;
      Xapian::Document currentDocument;
      /* Get next non redirect article */
      do {
 	currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
      } while (this->currentArticleOffset++ &&
 	       currentArticle.isRedirect() && 
 	       this->currentArticleOffset != this->lastArticleOffset);
      if (!currentArticle.isRedirect()) {
 	/* Index the content */
 	this->htmlParser.reset();
 	string content (currentArticle.getData().data(), currentArticle.getData().size());
 	/* The parser generate a lot of exceptions which should be avoided */
 	try {
 	  this->htmlParser.parse_html(content, "UTF-8", true);
 	} catch (...) {
 	}
 	/* If content does not have the noindex meta tag */
 	/* Seems that the parser generates an exception in such case */
 	found = this->htmlParser.dump.find("NOINDEX");
 	if (found == string::npos) {
 	  /* Put the data in the document */
 	  currentDocument.clear_values();
 	  currentDocument.add_value(0, this->htmlParser.title);
 	  currentDocument.set_data(currentArticle.getLongUrl().c_str());
 	  indexer.set_document(currentDocument);
 	  /* Debug output */
 	  if (verbose) {
 	    std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
 	  }
 	  /* Index the title */
 	  if (!this->htmlParser.title.empty()) {
 	    indexer.index_text_without_positions(removeAccents(this->htmlParser.title), 
 						 ((this->htmlParser.dump.size() / 100) + 1) / 
 						 countWords(this->htmlParser.title) );
 	  }
 	  /* Index the keywords */
 	  if (!this->htmlParser.keywords.empty()) {
 	    indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
 	  }
 	  /* Index the content */
 	  if (!this->htmlParser.dump.empty()) {
 	    indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
 	  }
 	  /* add to the database */
 	  this->writableDatabase->add_document(currentDocument);
 	}
      }
    }
    /* Flush and close Xapian transaction*/
    this->writableDatabase->commit_transaction();
    /* increment the offset and set returned value */
    if (this->currentArticleOffset < this->lastArticleOffset) {
      this->currentArticleOffset++;
      return true;
    } else {
      this->stopIndexing();
      return false;
    }
  }
  /* Stop indexing. TODO: using it crashes the software under Windows. Has to be done in indexNextPercent() */
  void XapianIndexer::stopIndexing() {
    /* Release the ZIM file handler (deleting a NULL pointer is a no-op) */
    delete this->zimFileHandler;
    this->zimFileHandler = NULL;

    /* Release the Xapian writable database the same way */
    delete this->writableDatabase;
    this->writableDatabase = NULL;
  }
 }
--- a/src/common/kiwix/xapianIndexer.h
+++ b/src/common/kiwix/xapianIndexer.h
@ -0,0 +1,41 @@
 #ifndef KIWIX_XAPIAN_INDEXER_H
 #define KIWIX_XAPIAN_INDEXER_H
 #include <string>
 #include <vector>
 #include <fstream>
 #include <iostream>
 #include <xapian.h>
 #include <unaccent.h>
 #include <zim/file.h>
 #include <zim/article.h>
 #include <zim/fileiterator.h>
 #include "xapian/myhtmlparse.h"
 #include "indexer.h"
 using namespace std;
 namespace kiwix {
  /* Indexer which writes the index of a ZIM file into a Xapian database */
  class XapianIndexer : public Indexer {

  public:
    /* Opens the Xapian database at xapianDirectoryPath and prepares the indexing */
    XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);

    /* Stops the indexing */
    ~XapianIndexer();

    /* Indexes the next batch of articles; returns false once nothing is left to index */
    virtual bool indexNextPercent(const bool &verbose = false);

  protected:
    /* Computes the article offsets and the size of one batch */
    virtual void prepareIndexing();

    /* Releases the ZIM file handler and the Xapian database */
    virtual void stopIndexing();

    /* Xapian database receiving the documents; NULL once the indexing is stopped */
    Xapian::WritableDatabase *writableDatabase;

    /* Xapian text processing helpers */
    Xapian::Stem stemmer;
    Xapian::SimpleStopper stopper;
    Xapian::TermGenerator indexer;
  };
 }
 #endif