+ factorization of the indexing code

2009-12-17 21:48:16 +00:00 · 2009-12-17 21:48:16 +00:00 · 4464e611a9
parent c19ad61051
commit 4464e611a9
2 changed files with 189 additions and 0 deletions
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@ -0,0 +1,146 @@
+#include "indexer.h"
+
+namespace kiwix {
+
+  /* Count word */
+  unsigned int countWords(const string &text) {
+    unsigned int numWords = 1;
+    for(int i=0; i<text.size();) {
+      while(i<text.size() && text[i] != ' ') {
+	i++;
+      }
+      numWords++;
+      i++;
+    }
+    return numWords;
+  }
+
+  /* Constructor */
+  Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath) 
+    : zimFileHandler(NULL), 
+      stemmer(Xapian::Stem("english")),
+      articleCount(0), 
+      stepSize(0) {
+
+    /* Open the ZIM file */
+    this->zimFileHandler = new zim::File(zimFilePath);
+    
+    if (this->zimFileHandler != NULL) {
+      this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
+      this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
+      this->currentArticleOffset = this->firstArticleOffset;
+    } else {
+      throw("Unable to open " + zimFilePath);
+    }
+
+    /* Open the Xapian directory */
+    this->writableDatabase = Xapian::WritableDatabase(xapianDirectoryPath, 
+						      Xapian::DB_CREATE_OR_OVERWRITE);
+  }
+  
+  /* Destructor */
+  Indexer::~Indexer() {
+
+    /* delete the zimFileHandler */
+    if (this->zimFileHandler != NULL) {
+      delete this->zimFileHandler;
+    }
+
+    /* delte the Xapian writableDatabase */
+    this->writableDatabase.~WritableDatabase();
+  }
+  
+  /* Start indexing */
+  void Indexer::startIndexing() {
+    
+    /* Compute few things */
+    this->articleCount = this->zimFileHandler->getNamespaceCount('A');
+    this->stepSize = (float)this->articleCount / (float)100;
+  }
+  
+  /* Index next percent */
+  bool Indexer::indexNextPercent() {
+    float thresholdOffset = this->currentArticleOffset + this->stepSize;
+    size_t found;
+    
+    while(this->currentArticleOffset < thresholdOffset && 
+	  this->currentArticleOffset < this->lastArticleOffset) {
+      
+      /* get next non redirect article */
+      do {
+	currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
+      } while (this->currentArticleOffset++ &&
+	       currentArticle.isRedirect() && 
+	       this->currentArticleOffset != this->lastArticleOffset);
+      
+      if (!currentArticle.isRedirect()) {
+	/* Index the content */
+	this->htmlParser.reset();
+	string content (currentArticle.getData().data(), currentArticle.getData().size());
+
+	try {
+	  this->htmlParser.parse_html(content, "UTF-8", true);
+	} catch (...) {
+	}
+	
+	/* if content does not have the noindex meta tag */
+	found=this->htmlParser.dump.find("NOINDEX");
+	
+	if (found == string::npos) {
+	  
+	  /* Set the stemmer */
+	  /* TODO, autodetect the language */
+	  //indexer.set_stemmer(stemmer);
+	  
+	  /* Put the data in the document */
+	  Xapian::Document document;
+	  document.add_value(0, this->htmlParser.title);
+	  document.set_data(currentArticle.getUrl().getValue().c_str());
+	  indexer.set_document(document);
+	  
+	  /* Debug output */
+	  std::cout << "Indexing " << currentArticle.getUrl().getValue() << "..." << std::endl;
+	  
+	  /* Index the title */
+	  if (!this->htmlParser.title.empty()) {
+	    indexer.index_text_without_positions(removeAccents(this->htmlParser.title.c_str()), 
+						 ((this->htmlParser.dump.size() / 100) + 1) / 
+						 countWords(this->htmlParser.title) );
+	  }
+	  
+	  /* Index the keywords */
+	  if (!this->htmlParser.keywords.empty()) {
+	    indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords.c_str()), 3);
+	  }
+	  
+	  /* Index the content */
+	  if (!this->htmlParser.dump.empty()) {
+	    indexer.index_text_without_positions(removeAccents(this->htmlParser.dump.c_str()));
+	  }
+	  
+	  /* add to the database */
+	  this->writableDatabase.add_document(document);
+	}
+      }
+    }
+    
+    /* Write Xapian DB to the disk */
+    this->writableDatabase.flush();
+
+    /* increment the offset and set returned value */
+    if (this->currentArticleOffset < this->lastArticleOffset) {
+      this->currentArticleOffset++;
+      return true;
+    } else {
+      this->stopIndexing();
+      return false;
+    }
+  }
+  
+  /* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */
+  void Indexer::stopIndexing() {
+    this->currentArticleOffset = this->firstArticleOffset;
+    this->writableDatabase.~WritableDatabase();
+  }
+  
+}
--- a/src/common/kiwix/indexer.h
+++ b/src/common/kiwix/indexer.h
@ -0,0 +1,43 @@
+#ifndef KIWIX_INDEXER_H
+#define KIWIX_INDEXER_H
+
+#include <xapian.h>
+#include <unaccent.h>
+#include <zim/file.h>
+#include <zim/article.h>
+#include <zim/fileiterator.h>
+#include "xapian/myhtmlparse.h"
+
+using namespace std;
+
+namespace kiwix {
+
+  class Indexer {
+    
+  public:
+    Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
+    ~Indexer();    
+    
+    void startIndexing();
+    bool indexNextPercent();
+    void stopIndexing();
+
+  protected:
+    zim::File* zimFileHandler;
+    zim::size_type firstArticleOffset;
+    zim::size_type lastArticleOffset;
+    zim::size_type currentArticleOffset;
+    zim::Article currentArticle;
+    
+    unsigned int articleCount;
+    float stepSize;
+    
+    Xapian::WritableDatabase writableDatabase;
+    Xapian::Stem stemmer;
+    Xapian::TermGenerator indexer;
+    MyHtmlParser htmlParser;
+  };
+
+}
+
+#endif