+ new indexer code

2012-04-06 21:48:43 +00:00 · 2012-04-06 21:48:43 +00:00 · 62daa9ffe5
parent 735d9afd3a
commit 62daa9ffe5
6 changed files with 169 additions and 252 deletions
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@ -35,20 +35,9 @@ namespace kiwix {
  }

  /* Constructor */
-  Indexer::Indexer(const string &zimFilePath) 
-    : zimFileHandler(NULL), 
-      articleCount(0), 
-      stepSize(0),
-      keywordsBoostFactor(3) {
-   
-    this->initialize();
-    this->setZimFilePath(zimFilePath);
+  Indexer::Indexer() :
+    keywordsBoostFactor(3) {

-    /* Read the stopwords file */
-    //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
-  }
-
-  void Indexer::initialize() {
    /* Initialize mutex */
    pthread_mutex_init(&threadIdsMutex, NULL);
    pthread_mutex_init(&toParseQueueMutex, NULL);
@ -57,44 +46,34 @@ namespace kiwix {
    pthread_mutex_init(&articleParserRunningMutex, NULL);
    pthread_mutex_init(&articleIndexerRunningMutex, NULL);
    pthread_mutex_init(&articleCountMutex, NULL);
+    pthread_mutex_init(&zimPathMutex, NULL);
+    pthread_mutex_init(&indexPathMutex, NULL);
    pthread_mutex_init(&progressionMutex, NULL);
    
-    /* Article count & Progression */
-    this->setProgression(0);
-  }
-
-  bool Indexer::setZimFilePath(const string &zimFilePath) {
-    /* Open the ZIM file */
-    this->zimFileHandler = new zim::File(zimFilePath);
-
-    /* Define a few values */
-    this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
-    this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
-    this->currentArticleOffset = this->firstArticleOffset;
-    
-    /* Compute few things */
-    kiwix::Reader reader(zimFilePath);
-    this->setArticleCount(reader.getArticleCount());
-    //this->articleCount = this->zimFileHandler->getNamespaceCount('A');
-    this->stepSize = (float)this->articleCount / (float)100;
+    /* Read the stopwords file */
+    //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
  }

  /* Article extractor methods */
  void *Indexer::extractArticles(void *ptr) {
    kiwix::Indexer *self = (kiwix::Indexer *)ptr;
    self->articleExtractorRunning(true);
-    unsigned int startOffset = self->zimFileHandler->getNamespaceBeginOffset('A');
-    unsigned int endOffset = self->zimFileHandler->getNamespaceEndOffset('A');
+
+    /* Get the number of articles to index */
+    kiwix::Reader reader(self->getZimPath());
+    self->setArticleCount(reader.getArticleCount());

    /* Goes trough all articles */
-    unsigned int currentOffset = startOffset;
+    zim::File *zimHandler = reader.getZimFileHandler();
+    unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');;
+    unsigned int lastOffset = zimHandler->getNamespaceEndOffset('A');;
    zim::Article currentArticle;

-    while (currentOffset <= endOffset) {
+    while (currentOffset <= lastOffset) {
      /* Redirects are not indexed */
      do {
-	currentArticle = self->zimFileHandler->getArticle(currentOffset++);
-      } while (currentArticle.isRedirect() && currentOffset != endOffset);
+	currentArticle = zimHandler->getArticle(currentOffset++);
+      } while (currentArticle.isRedirect() && currentOffset != lastOffset);

      /* Add articles to the queue */
      indexerToken token;
@ -176,7 +155,8 @@ namespace kiwix {
 	self->pushToIndexQueue(token);

 	/* Test if the thread should be cancelled */
-	pthread_testcancel();      }
+	pthread_testcancel();     
+      }
    }
    
    self->articleParserRunning(false);
@ -201,29 +181,36 @@ namespace kiwix {
  void *Indexer::indexArticles(void *ptr) {
    kiwix::Indexer *self = (kiwix::Indexer *)ptr;
    self->articleIndexerRunning(true);
+
    indexerToken token;
-    unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100));
    unsigned indexedArticleCount = 0;
+    unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100));
+    self->indexingPrelude(self->getIndexPath()); 

    while (self->popFromToIndexQueue(token)) {
-      self->indexNextArticle(token.url, 
-			     token.accentedTitle,
-			     token.title, 
-			     token.keywords,
-			     token.content,
-			     token.snippet,
-			     token.size,
-			     token.wordCount
-			     );
-
+      self->index(token.url, 
+		  token.accentedTitle,
+		  token.title, 
+		  token.keywords,
+		  token.content,
+		  token.snippet,
+		  token.size,
+		  token.wordCount
+		  );
+      
      if (++indexedArticleCount % stepSize == 0) {
 	self->setProgression(self->getProgression() + 1);
      }
+
+      if (indexedArticleCount % 10000 == 0) {
+	self->flush();
+      }
+
+      /* Test if the thread should be cancelled */
+      pthread_testcancel();
    }
-
    self->setProgression(100);
-    self->indexNextPercentPost();
-
+    self->indexingPostlude();
    self->articleIndexerRunning(false);
    pthread_exit(NULL);
    return NULL;
@ -306,8 +293,34 @@ namespace kiwix {
    return true;
  }

-  /* Article Count & Progression */
-  void Indexer::setArticleCount(unsigned int articleCount) {
+  /* ZIM & Index methods */
+  /* Record the path of the ZIM file; the assignment is done while holding zimPathMutex */
+  void Indexer::setZimPath(const string path) {
+    pthread_mutex_lock(&zimPathMutex);
+    zimPath = path;
+    pthread_mutex_unlock(&zimPathMutex);
+  }
+
+  /* Return a copy of the ZIM path; the copy is taken while holding zimPathMutex */
+  string Indexer::getZimPath() {
+    pthread_mutex_lock(&zimPathMutex);
+    string pathCopy(zimPath);
+    pthread_mutex_unlock(&zimPathMutex);
+    return pathCopy;
+  }
+
+  /* Record the path of the index; the assignment is done while holding indexPathMutex */
+  void Indexer::setIndexPath(const string path) {
+    pthread_mutex_lock(&indexPathMutex);
+    indexPath = path;
+    pthread_mutex_unlock(&indexPathMutex);
+  }
+
+  /* Return a copy of the index path; the copy is taken while holding indexPathMutex */
+  string Indexer::getIndexPath() {
+    pthread_mutex_lock(&indexPathMutex);
+    string pathCopy(indexPath);
+    pthread_mutex_unlock(&indexPathMutex);
+    return pathCopy;
+  }
+
+  void Indexer::setArticleCount(const unsigned int articleCount) {
    pthread_mutex_lock(&articleCountMutex); 
    this->articleCount = articleCount;
    pthread_mutex_unlock(&articleCountMutex); 
@ -320,7 +333,7 @@ namespace kiwix {
    return retVal;
  }

-  void Indexer::setProgression(unsigned int progression) {
+  void Indexer::setProgression(const unsigned int progression) {
    pthread_mutex_lock(&progressionMutex); 
    this->progression = progression;
    pthread_mutex_unlock(&progressionMutex); 
@ -333,8 +346,12 @@ namespace kiwix {
    return retVal;
  }

-  bool Indexer::start() {
-    this->indexNextPercentPre();
+  /* Manage */
+  bool Indexer::start(const string &zimPath, const string &indexPath) {
+    this->setProgression(0);
+    this->setZimPath(zimPath);
+    this->setIndexPath(indexPath);
+    
    pthread_mutex_lock(&threadIdsMutex); 
    pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this);
    pthread_detach(this->articleExtractor);
@ -343,15 +360,7 @@ namespace kiwix {
    pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this);
    pthread_detach(this->articleIndexer);
    pthread_mutex_unlock(&threadIdsMutex);
-    return true;
-  }

-  bool Indexer::stop() {
-    pthread_mutex_lock(&threadIdsMutex); 
-    pthread_cancel(this->articleExtractor);
-    pthread_cancel(this->articleParser);
-    pthread_cancel(this->articleIndexer);
-    pthread_mutex_unlock(&threadIdsMutex); 
    return true;
  }

@ -359,12 +368,27 @@ namespace kiwix {
    return this->isArticleExtractorRunning() || this->isArticleIndexerRunning() || this->isArticleParserRunning();
  }

-  void Indexer::setCurrentArticleOffset(unsigned int offset) {
-    this->currentArticleOffset = offset;
-  }
+  /*
+   * Ask the worker threads that are still flagged as running to stop.
+   *
+   * Does nothing when isRunning() is false. Otherwise the three running
+   * flags are read first, then, while holding threadIdsMutex, a
+   * cancellation request is sent to each thread whose own flag was set.
+   * Finally the article indexer flag is reset.
+   *
+   * Always returns true.
+   *
+   * NOTE(review): only the indexer flag is reset here; the extractor and
+   * parser flags are left untouched, as before - confirm this is intended.
+   */
+  bool Indexer::stop() {
+    if (this->isRunning()) {
+      /* Read the flags before taking threadIdsMutex */
+      bool isArticleExtractorRunning = this->isArticleExtractorRunning();
+      bool isArticleParserRunning = this->isArticleParserRunning();
+      bool isArticleIndexerRunning = this->isArticleIndexerRunning();
+
+      pthread_mutex_lock(&threadIdsMutex);
+
+      /* Each flag guards the cancellation of its own thread */
+      if (isArticleExtractorRunning)
+	pthread_cancel(this->articleExtractor);
+      if (isArticleParserRunning)
+	pthread_cancel(this->articleParser);
+      if (isArticleIndexerRunning)
+	pthread_cancel(this->articleIndexer);
+
+      pthread_mutex_unlock(&threadIdsMutex);
+
+      this->articleIndexerRunning(false);
+    }

-  unsigned int Indexer::getCurrentArticleOffset() {
-    return this->currentArticleOffset;
+    return true;
   }

  /* Read the file containing the stopwords */
@ -382,102 +406,4 @@ namespace kiwix {
    return true;
  }

-  /* Index next percent */
-  bool Indexer::indexNextPercent(const bool &verbose) {
-    float thresholdOffset = this->currentArticleOffset + this->stepSize;
-    size_t found;
-
-    /* Check if we can start */
-    if (this->zimFileHandler == NULL) {
-      return false;
-    }
-
-    this->indexNextPercentPre();
-
-    while(this->currentArticleOffset < thresholdOffset && 
-	  this->currentArticleOffset <= this->lastArticleOffset) {
-
-      zim::Article currentArticle;
-      
-      /* Get next non redirect article */
-      do {
-	currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
-      } while (this->currentArticleOffset++ &&
-	       currentArticle.isRedirect() && 
-	       this->currentArticleOffset != this->lastArticleOffset);
-
-      if (!currentArticle.isRedirect()) {
-	
-	/* Index the content */
-	this->htmlParser.reset();
-	string content (currentArticle.getData().data(), currentArticle.getData().size());
-
-	/* The parser generate a lot of exceptions which should be avoided */
-	try {
-	  this->htmlParser.parse_html(content, "UTF-8", true);
-	} catch (...) {
-	}
-	
-	/* If content does not have the noindex meta tag */
-	/* Seems that the parser generates an exception in such case */
-	found = this->htmlParser.dump.find("NOINDEX");
-	
-	if (found == string::npos) {
-	  string url = currentArticle.getLongUrl();
-	  
-	  /* Debug output */
-	  if (verbose) {
-	    std::cout << "Indexing " << url << "..." << std::endl;
-	  }
-
-	  /* Get the title */
-	  string accentedTitle = this->htmlParser.title;
-	  if (accentedTitle.empty()) {
-	    accentedTitle = currentArticle.getTitle();
-	  }
-
-	  /* count words */
-	  stringstream countWordStringStream;
-	  countWordStringStream << countWords(this->htmlParser.dump);
-	  const std::string wordCountString = countWordStringStream.str();
-
-	  /* snippet */
-	  std::string snippet = std::string(this->htmlParser.dump, 0, 300);
-	  std::string::size_type last = snippet.find_last_of('.');
-	  if (last == snippet.npos)
-	    last = snippet.find_last_of(' ');
-	  if (last != snippet.npos)
-	    snippet = snippet.substr(0, last);
-
-	  /* size */
-	  stringstream sizeStringStream;
-	  sizeStringStream << content.size() / 1024;
-	  const std::string size = sizeStringStream.str();
-
-	  this->indexNextArticle(url, 
-				 accentedTitle,
-				 removeAccents(this->htmlParser.title), 
-				 removeAccents(this->htmlParser.keywords),
-				 removeAccents(this->htmlParser.dump),
-				 snippet,
-				 size,
-				 wordCountString
-				 );
-
-	}
-      }
-    }
-
-    this->indexNextPercentPost();
-    
-    /* increment the offset and set returned value */
-    if (this->currentArticleOffset <= this->lastArticleOffset) {
-      return true;
-    } else {
-        // commented as it never returns on OSX.
-      //this->stopIndexing();
-      return false;
-    }
-  }
-
 }
--- a/src/common/kiwix/indexer.h
+++ b/src/common/kiwix/indexer.h
@ -54,17 +54,14 @@ namespace kiwix {
  class Indexer {
    
  public:
-    Indexer(const string &zimFilePath);
-    bool indexNextPercent(const bool &verbose = false);
-    bool setZimFilePath(const string &zimFilePath);
-    bool start();
+    Indexer();
+    bool start(const string &zimPath, const string &indexPath);
    bool stop();
    bool isRunning();
    unsigned int getProgression();

  private:
    pthread_mutex_t threadIdsMutex;
-    void initialize();

    /* Article extraction */
    pthread_t articleExtractor;
@ -107,46 +104,47 @@ namespace kiwix {
    /* Article Count & Progression */
    unsigned int articleCount;
    pthread_mutex_t articleCountMutex;
-    void setArticleCount(unsigned int articleCount);
+    void setArticleCount(const unsigned int articleCount);
    unsigned int getArticleCount();
+
+    /* Progression */
    unsigned int progression;
    pthread_mutex_t progressionMutex;
-    void setProgression(unsigned int progression);
+    void setProgression(const unsigned int progression);
+    /* getProgression() is public */
+
+    /* ZIM path */
+    pthread_mutex_t zimPathMutex;
+    string zimPath;
+    void setZimPath(const string path);
+    string getZimPath();
+
+    /* Index path */
+    pthread_mutex_t indexPathMutex;
+    string indexPath;
+    void setIndexPath(const string path);
+    string getIndexPath();

  protected:
-    virtual void indexNextPercentPre() = 0;
-    virtual void indexNextArticle(const string &url, 
-				  const string &title, 
-				  const string &unaccentedTitle,
-				  const string &keywords, 
-				  const string &content,
-				  const string &snippet,
-				  const string &size,
-				  const string &wordCount) = 0;
-    virtual void indexNextPercentPost() = 0;
-    virtual void stopIndexing() = 0;
-
-    /* Article offset */
-    void setCurrentArticleOffset(unsigned int offset);
-    unsigned int getCurrentArticleOffset();
-
-    /* ZIM file handling */
-    zim::File* zimFileHandler;
-    zim::size_type firstArticleOffset;
-    zim::size_type lastArticleOffset;
-    zim::size_type currentArticleOffset;
+    virtual void indexingPrelude(const string &indexPath) = 0;
+    virtual void index(const string &url, 
+		       const string &title, 
+		       const string &unaccentedTitle,
+		       const string &keywords, 
+		       const string &content,
+		       const string &snippet,
+		       const string &size,
+		       const string &wordCount) = 0;
+    virtual void flush() = 0;
+    virtual void indexingPostlude() = 0;
    
-    /* HTML parsing */
-    MyHtmlParser htmlParser;
+    /* Others */
    unsigned int countWords(const string &text);

    /* Stopwords */
    bool readStopWordsFile(const string path);
    std::vector<std::string> stopWords;

-    /* Others */
-    float stepSize;
-
    /* Boost factor */
    unsigned int keywordsBoostFactor;
    inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {
--- a/src/common/kiwix/reader.cpp
+++ b/src/common/kiwix/reader.cpp
@ -62,6 +62,10 @@ namespace kiwix {
      delete this->zimFileHandler;
    }
  }
+
+  /* Expose the zim::File pointer held by this Reader.
+     NOTE(review): the Reader deletes zimFileHandler itself (see above), so
+     callers presumably must not delete the returned pointer - to be confirmed. */
+  zim::File* Reader::getZimFileHandler() {
+    return zimFileHandler;
+  }
  
  /* Reset the cursor for GetNextArticle() */
  void Reader::reset() {
--- a/src/common/kiwix/reader.h
+++ b/src/common/kiwix/reader.h
@ -63,6 +63,7 @@ namespace kiwix {
    bool canCheckIntegrity();
    bool isCorrupted();
    bool parseUrl(const string &urlStr, char *ns, string &titleStr);
+    zim::File* getZimFileHandler();
    
  protected:
    zim::File* zimFileHandler;
--- a/src/common/kiwix/xapianIndexer.cpp
+++ b/src/common/kiwix/xapianIndexer.cpp
@ -22,20 +22,15 @@
 namespace kiwix {

  /* Constructor */
-  XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
-    Indexer(zimFilePath) {
-    
-    /* Open the Xapian directory */
-    this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, 
-							  Xapian::DB_CREATE_OR_OVERWRITE);
-
+  XapianIndexer::XapianIndexer() {
    /* Stemming */
    /*
    stemmer = Xapian::Stem("french");
    indexer.set_stemmer(stemmer);
    */

-    /* Stop words
+    /* Stop words */
+    /*
    std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
    this->stopper.add("ceci");
    while (stopWordsIterator != this->stopWords.end()) {
@ -46,19 +41,20 @@ namespace kiwix {
    */
  }
  
-  void XapianIndexer::indexNextPercentPre() {
-    this->writableDatabase->begin_transaction(true);
+  /* Open the Xapian database at indexPath with DB_CREATE_OR_OVERWRITE, then begin a transaction */
+  void XapianIndexer::indexingPrelude(const string &indexPath) {
+    writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE);
+    writableDatabase.begin_transaction(true);
   }
  
-  void XapianIndexer::indexNextArticle(const string &url, 
-				       const string &title, 
-				       const string &unaccentedTitle,
-				       const string &keywords, 
-				       const string &content,
-				       const string &snippet,
-				       const string &size,
-				       const string &wordCount) {
-
+  void XapianIndexer::index(const string &url, 
+			    const string &title, 
+			    const string &unaccentedTitle,
+			    const string &keywords, 
+			    const string &content,
+			    const string &snippet,
+			    const string &size,
+			    const string &wordCount) {
+    
    /* Put the data in the document */
    Xapian::Document currentDocument; 
    currentDocument.clear_values();
@ -85,26 +81,17 @@ namespace kiwix {
    }
    
    /* add to the database */
-    this->writableDatabase->add_document(currentDocument);
-  }
-
-  void XapianIndexer::indexNextPercentPost() {
-    /* Flush and close Xapian transaction*/
-    this->writableDatabase->commit_transaction();
+    this->writableDatabase.add_document(currentDocument);
  }
  
-  /* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */
-  void XapianIndexer::stopIndexing() {
-    /* Delete the zimFileHandler */
-    if (this->zimFileHandler != NULL) {
-      delete this->zimFileHandler;
-      this->zimFileHandler = NULL;
-    }
-    
-    /* Delete the Xapian writableDatabase */
-    if (this->writableDatabase != NULL) {
-      delete this->writableDatabase;
-      this->writableDatabase = NULL;
-    }
+  /* Commit the transaction in progress and immediately begin a new one */
+  void XapianIndexer::flush() {
+    writableDatabase.commit_transaction();
+    writableDatabase.begin_transaction(true);
+  }
+
+  /* End of indexing: call flush(), commit the transaction still open afterwards, then commit() the database */
+  void XapianIndexer::indexingPostlude() {
+    flush();
+    writableDatabase.commit_transaction();
+    writableDatabase.commit();
   }
 }
--- a/src/common/kiwix/xapianIndexer.h
+++ b/src/common/kiwix/xapianIndexer.h
@ -30,22 +30,23 @@ namespace kiwix {
  class XapianIndexer : public Indexer {
    
  public:
-    XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);
+    XapianIndexer();
+    ~XapianIndexer();
    
  protected:
-    void indexNextPercentPre();
-    void indexNextArticle(const string &url, 
-			  const string &title, 
-			  const string &unaccentedTitle,
-			  const string &keywords, 
-			  const string &content,
-			  const string &snippet,
-			  const string &size,
-			  const string &wordCount);
-    void indexNextPercentPost();
-    void stopIndexing();
+    void indexingPrelude(const string &indexPath);
+    void index(const string &url, 
+	       const string &title, 
+	       const string &unaccentedTitle,
+	       const string &keywords, 
+	       const string &content,
+	       const string &snippet,
+	       const string &size,
+	       const string &wordCount);
+    void flush();
+    void indexingPostlude();
    
-    Xapian::WritableDatabase *writableDatabase;
+    Xapian::WritableDatabase writableDatabase;
    Xapian::Stem stemmer;
    Xapian::SimpleStopper stopper;
    Xapian::TermGenerator indexer;