+ factoring of the kiwix::Indexer class into kiwix::Indexer (abstract) and kiwix::XapianIndexer

2010-10-27 19:23:05 +00:00 · 2010-10-27 19:23:05 +00:00 · ef2423b1a7
parent d52c86bcac
commit ef2423b1a7
4 changed files with 215 additions and 169 deletions
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@ -20,158 +20,9 @@ namespace kiwix {
    : zimFileHandler(NULL), 
      articleCount(0), 
      stepSize(0) {
-
+    
    /* Open the ZIM file */
    this->zimFileHandler = new zim::File(zimFilePath);
-    
-    /* Open the Xapian directory */
-    this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, 
-							  Xapian::DB_CREATE_OR_OVERWRITE);
-
-    /* Stemming */
-    /*
-    stemmer = Xapian::Stem("french");
-    indexer.set_stemmer(stemmer);
-    */
-
-    /* Read the stopwords file */
-    /*
-    this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
-    std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
-    this->stopper.add("ceci");
-    while (stopWordsIterator != this->stopWords.end()) {
-      this->stopper.add(*stopWordsIterator);
-      stopWordsIterator++;
-    }
-    indexer.set_stopper(&(this->stopper));
-    */
-
-    /* Prepare the indexation */
-    this->prepareIndexing();
-  }
-  
-  /* Destructor */
-  Indexer::~Indexer() {
-    this->stopIndexing();
-  }
-  
-  /* Start indexing */
-  void Indexer::prepareIndexing() {
-
-    /* Define a few values */
-    this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
-    this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
-    this->currentArticleOffset = this->firstArticleOffset;
-    
-    /* Compute few things */
-    this->articleCount = this->zimFileHandler->getNamespaceCount('A');
-    this->stepSize = (float)this->articleCount / (float)100;
-  }
-  
-  /* Index next percent */
-  bool Indexer::indexNextPercent(const bool &verbose) {
-    float thresholdOffset = this->currentArticleOffset + this->stepSize;
-    size_t found;
-
-    /* Check if we can start */
-    if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
-      return false;
-    }
-
-    /* Begin the Xapian transation */
-    this->writableDatabase->begin_transaction(true);
-
-    while(this->currentArticleOffset < thresholdOffset && 
-	  this->currentArticleOffset < this->lastArticleOffset) {
-
-      zim::Article currentArticle;
-      Xapian::Document currentDocument;
-      
-      /* Get next non redirect article */
-      do {
-	currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
-      } while (this->currentArticleOffset++ &&
-	       currentArticle.isRedirect() && 
-	       this->currentArticleOffset != this->lastArticleOffset);
-      
-      if (!currentArticle.isRedirect()) {
-	
-	/* Index the content */
-	this->htmlParser.reset();
-	string content (currentArticle.getData().data(), currentArticle.getData().size());
-
-	/* The parser generate a lot of exceptions which should be avoided */
-	try {
-	  this->htmlParser.parse_html(content, "UTF-8", true);
-	} catch (...) {
-	}
-	
-	/* If content does not have the noindex meta tag */
-	/* Seems that the parser generates an exception in such case */
-	found = this->htmlParser.dump.find("NOINDEX");
-	
-	if (found == string::npos) {
-	  
-	  /* Put the data in the document */
-	  currentDocument.clear_values();
-	  currentDocument.add_value(0, this->htmlParser.title);
-	  currentDocument.set_data(currentArticle.getLongUrl().c_str());
-	  indexer.set_document(currentDocument);
-	  
-	  /* Debug output */
-	  if (verbose) {
-	    std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
-	  }
-	  
-	  /* Index the title */
-	  if (!this->htmlParser.title.empty()) {
-	    indexer.index_text_without_positions(removeAccents(this->htmlParser.title), 
-						 ((this->htmlParser.dump.size() / 100) + 1) / 
-						 countWords(this->htmlParser.title) );
-	  }
-	  
-	  /* Index the keywords */
-	  if (!this->htmlParser.keywords.empty()) {
-	    indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
-	  }
-	  
-	  /* Index the content */
-	  if (!this->htmlParser.dump.empty()) {
-	    indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
-	  }
-	  
-	  /* add to the database */
-	  this->writableDatabase->add_document(currentDocument);
-	}
-      }
-    }
-    
-    /* Flush and close Xapian transaction*/
-    this->writableDatabase->commit_transaction();
-
-    /* increment the offset and set returned value */
-    if (this->currentArticleOffset < this->lastArticleOffset) {
-      this->currentArticleOffset++;
-      return true;
-    } else {
-      this->stopIndexing();
-      return false;
-    }
-  }
-  
-  /* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */
-  void Indexer::stopIndexing() {
-    /* Delete the zimFileHandler */
-    if (this->zimFileHandler != NULL) {
-      delete this->zimFileHandler;
-      this->zimFileHandler = NULL;
-    }
-    
-    /* Delete the Xapian writableDatabase */
-    if (this->writableDatabase != NULL) {
-      delete this->writableDatabase;
-      this->writableDatabase = NULL;
-    }
  }

  /* Read the file containing the stopwords */
--- a/src/common/kiwix/indexer.h
+++ b/src/common/kiwix/indexer.h
@ -21,34 +21,30 @@ namespace kiwix {
    
  public:
    Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
-    ~Indexer();
-    
-    bool indexNextPercent(const bool &verbose = false);
+    virtual bool indexNextPercent(const bool &verbose = false) = 0;
    
  protected:
-    void prepareIndexing();
-    void stopIndexing();
-    unsigned int countWords(const string &text);
-
-    bool readStopWordsFile(const string path);
-
-    unsigned int articleCount;
-    float stepSize;
-
+    virtual void prepareIndexing() = 0;
+    virtual void stopIndexing() = 0;
+    
+    /* ZIM file handling */
    zim::File* zimFileHandler;
    zim::size_type firstArticleOffset;
    zim::size_type lastArticleOffset;
    zim::size_type currentArticleOffset;
    
-    Xapian::WritableDatabase *writableDatabase;
-    Xapian::Stem stemmer;
-    Xapian::SimpleStopper stopper;
-    Xapian::TermGenerator indexer;
-    
-    std::vector<std::string> stopWords;
+    /* HTML parsing */
    MyHtmlParser htmlParser;
-  };
+    unsigned int countWords(const string &text);

+    /* Stopwords */
+    bool readStopWordsFile(const string path);
+    std::vector<std::string> stopWords;
+
+    /* Others */
+    unsigned int articleCount;
+    float stepSize;
+  };
 }

 #endif
--- a/src/common/kiwix/xapianIndexer.cpp
+++ b/src/common/kiwix/xapianIndexer.cpp
@ -0,0 +1,158 @@
+#include "xapianIndexer.h"
+
+namespace kiwix {
+
+  /* Constructor: opens (creating or overwriting) the Xapian database stored in
+     xapianDirectoryPath, then calls prepareIndexing() to compute the range of
+     articles to index. zimFilePath is forwarded to the Indexer constructor.
+     Any exception raised while doing so is propagated to the caller. */
+  XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
+    Indexer(zimFilePath, xapianDirectoryPath),
+    writableDatabase(NULL) {
+
+    /* The destructor is not run when a constructor throws, so release what
+       was already acquired before letting the exception propagate */
+    try {
+
+      /* Open the Xapian directory */
+      this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
+                                                            Xapian::DB_CREATE_OR_OVERWRITE);
+
+      /* Stemming (currently disabled) */
+      /*
+      stemmer = Xapian::Stem("french");
+      indexer.set_stemmer(stemmer);
+      */
+
+      /* Read the stopwords file (currently disabled) */
+      /*
+      this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
+      std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
+      this->stopper.add("ceci");
+      while (stopWordsIterator != this->stopWords.end()) {
+        this->stopper.add(*stopWordsIterator);
+        stopWordsIterator++;
+      }
+      indexer.set_stopper(&(this->stopper));
+      */
+
+      /* Prepare the indexation */
+      this->prepareIndexing();
+    } catch (...) {
+      /* stopIndexing() deletes both handlers and tolerates NULL pointers */
+      this->stopIndexing();
+      throw;
+    }
+  }
+  
+  /* Destructor: hands the cleanup of the handlers over to stopIndexing() */
+  XapianIndexer::~XapianIndexer() {
+    stopIndexing();
+  }
+  
+  /* Read from the ZIM file the boundaries of the 'A' namespace and derive
+     the number of articles handled by one indexNextPercent() call */
+  void XapianIndexer::prepareIndexing() {
+    zim::File *zimFile = this->zimFileHandler;
+
+    /* Offsets delimiting the 'A' namespace; indexing starts at the first one */
+    this->firstArticleOffset = zimFile->getNamespaceBeginOffset('A');
+    this->lastArticleOffset = zimFile->getNamespaceEndOffset('A');
+    this->currentArticleOffset = this->firstArticleOffset;
+
+    /* One step is a hundredth of the article count (may be fractional) */
+    this->articleCount = zimFile->getNamespaceCount('A');
+    this->stepSize = static_cast<float>(this->articleCount) / 100.0f;
+  }
+  
+  /* Index the next slice of articles (stepSize articles, i.e. about one
+     percent of the 'A' namespace) inside a single Xapian transaction.
+     Redirects and pages whose parsed text contains "NOINDEX" are not indexed.
+     verbose: when true, the URL of each indexed article is printed on stdout.
+     Returns true while articles remain to be indexed. Returns false when the
+     handlers are not available, or when the last article has been reached,
+     in which case stopIndexing() is called. */
+  bool XapianIndexer::indexNextPercent(const bool &verbose) {
+
+    /* Check if we can start */
+    if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
+      return false;
+    }
+
+    const zim::size_type startOffset = this->currentArticleOffset;
+    const float thresholdOffset = this->currentArticleOffset + this->stepSize;
+
+    /* Begin the Xapian transaction */
+    this->writableDatabase->begin_transaction(true);
+
+    /* The test against startOffset guarantees that at least one article is
+       consumed per call, even if stepSize is 0 or lost in float rounding */
+    while ((this->currentArticleOffset < thresholdOffset ||
+            this->currentArticleOffset == startOffset) &&
+           this->currentArticleOffset < this->lastArticleOffset) {
+
+      zim::Article currentArticle;
+      Xapian::Document currentDocument;
+
+      /* Get next non redirect article. The post-increment always moves the
+         offset forward, so on exit it points to the article following the
+         one held in currentArticle */
+      do {
+        currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
+      } while (this->currentArticleOffset++ &&
+               currentArticle.isRedirect() &&
+               this->currentArticleOffset != this->lastArticleOffset);
+
+      /* Still a redirect if the end of the namespace stopped the search */
+      if (!currentArticle.isRedirect()) {
+
+        /* Parse the content */
+        this->htmlParser.reset();
+        string content (currentArticle.getData().data(), currentArticle.getData().size());
+
+        /* The parser generates a lot of exceptions which should be ignored */
+        try {
+          this->htmlParser.parse_html(content, "UTF-8", true);
+        } catch (...) {
+        }
+
+        /* If content does not have the noindex meta tag */
+        /* Seems that the parser generates an exception in such case */
+        const size_t found = this->htmlParser.dump.find("NOINDEX");
+
+        if (found == string::npos) {
+
+          /* Put the data in the document */
+          currentDocument.clear_values();
+          currentDocument.add_value(0, this->htmlParser.title);
+          currentDocument.set_data(currentArticle.getLongUrl().c_str());
+          indexer.set_document(currentDocument);
+
+          /* Debug output */
+          if (verbose) {
+            std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
+          }
+
+          /* Index the title; its weight grows with the size of the content
+             and is shared between the words of the title */
+          if (!this->htmlParser.title.empty()) {
+            /* NOTE(review): countWords() is defined elsewhere; guard against a
+               0 result (e.g. a blank title) to avoid a division by zero */
+            unsigned int titleWordCount = countWords(this->htmlParser.title);
+            if (titleWordCount == 0) {
+              titleWordCount = 1;
+            }
+            indexer.index_text_without_positions(removeAccents(this->htmlParser.title),
+                                                 ((this->htmlParser.dump.size() / 100) + 1) /
+                                                 titleWordCount);
+          }
+
+          /* Index the keywords */
+          if (!this->htmlParser.keywords.empty()) {
+            indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
+          }
+
+          /* Index the content */
+          if (!this->htmlParser.dump.empty()) {
+            indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
+          }
+
+          /* Add to the database */
+          this->writableDatabase->add_document(currentDocument);
+        }
+      }
+    }
+
+    /* Flush and close Xapian transaction */
+    this->writableDatabase->commit_transaction();
+
+    /* currentArticleOffset already designates the first article which was not
+       handled yet. It must not be incremented again here, otherwise one
+       article is skipped at each call */
+    if (this->currentArticleOffset < this->lastArticleOffset) {
+      return true;
+    }
+
+    /* Everything was indexed: release the handlers */
+    this->stopIndexing();
+    return false;
+  }
+  
+  /* Stop indexing: release the ZIM file handler and the Xapian database.
+     TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */
+  void XapianIndexer::stopIndexing() {
+    /* Deleting a NULL pointer does nothing, so no test is needed; resetting
+       the pointers makes any further call harmless */
+    delete this->zimFileHandler;
+    this->zimFileHandler = NULL;
+
+    delete this->writableDatabase;
+    this->writableDatabase = NULL;
+  }
+}
--- a/src/common/kiwix/xapianIndexer.h
+++ b/src/common/kiwix/xapianIndexer.h
@ -0,0 +1,41 @@
+#ifndef KIWIX_XAPIAN_INDEXER_H
+#define KIWIX_XAPIAN_INDEXER_H
+
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iostream>
+
+#include <xapian.h>
+#include <unaccent.h>
+#include <zim/file.h>
+#include <zim/article.h>
+#include <zim/fileiterator.h>
+#include "xapian/myhtmlparse.h"
+#include "indexer.h"
+
+using namespace std;
+
+namespace kiwix {
+  
+  /* Indexer implementation which stores the index in a Xapian database */
+  class XapianIndexer : public Indexer {
+
+  public:
+    /* zimFilePath is handed to the Indexer base class, xapianDirectoryPath is
+       the directory of the Xapian database (created or overwritten) */
+    XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);
+
+    /* NOTE(review): Indexer does not declare a virtual destructor in this
+       change; confirm instances are never deleted through an Indexer pointer */
+    ~XapianIndexer();
+
+    /* Implementation of the pure virtual Indexer::indexNextPercent() */
+    virtual bool indexNextPercent(const bool &verbose = false);
+
+  protected:
+    /* Implementations of the pure virtual hooks declared by Indexer */
+    virtual void prepareIndexing();
+    virtual void stopIndexing();
+
+    /* Xapian database, allocated by the constructor and deleted by stopIndexing() */
+    Xapian::WritableDatabase *writableDatabase;
+    /* Only referenced by the disabled stemming/stopwords code of the constructor */
+    Xapian::Stem stemmer;
+    Xapian::SimpleStopper stopper;
+    /* Term generator used to index titles, keywords and content */
+    Xapian::TermGenerator indexer;
+  };
+
+}
+
+#endif