From 6e66fd176d3e6e90379bd0825758a1010c3a9e08 Mon Sep 17 00:00:00 2001
From: kelson42 <kelson42@users.sourceforge.net>
Date: Thu, 29 Mar 2012 13:33:15 +0000
Subject: [PATCH] + further dev of the multithreaded indexer

---
 src/common/kiwix/indexer.cpp | 45 ++++++++++++++++++++++++++++++++++--
 src/common/kiwix/indexer.h   |  1 +
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp
index bf6443dff..be946a184 100644
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@@ -84,6 +84,8 @@ namespace kiwix {
       /* Add articles to the queue */
       indexerArticleToken token;
       token.title = currentArticle.getTitle();
+      token.url = currentArticle.getLongUrl(); /* long URL of the article being queued */
+      token.content = string(currentArticle.getData().data(), currentArticle.getData().size()); /* length-based copy of the article data */
       self->pushArticleToQueue(token);
 
       /* Test if the thread should be cancelled */
@@ -106,11 +108,12 @@ namespace kiwix {
     pthread_mutex_lock(&articleQueueMutex); 
     this->articleQueue.push(token);
     pthread_mutex_unlock(&articleQueueMutex); 
+    sleep(int(this->articleQueue.size() / 200) / 10); /* throttle: 1 s per 2000 queued; NOTE(review): size() is read after the mutex is released */
   }
 
   bool Indexer::popArticleFromQueue(indexerArticleToken &token) {
     while (this->isArticleQueueEmpty() && this->isArticleExtractorRunning()) {
-      sleep(1);
+      usleep(500000); /* 0.5 s; sleep() takes whole seconds, so sleep(0.5) would be sleep(0) and spin */
     }
 
     if (!this->isArticleQueueEmpty()) {
@@ -127,12 +130,58 @@ namespace kiwix {
 
   void *Indexer::parseArticles(void *ptr) {
     kiwix::Indexer *self = (kiwix::Indexer *)ptr;
     indexerArticleToken token;
 
     while (self->popArticleFromQueue(token)) {
       cout << token.title << endl;
-    }
 
+      /* Use a fresh parser per article so that the title and text dump
+         collected for one article cannot leak into the next one.
+         NOTE(review): assumes MyHtmlParser keeps state between parse_html()
+         calls -- confirm against its declaration. */
+      MyHtmlParser htmlParser;
+
+      /* The parser signals some conditions by throwing (apparently also
+         when it meets the noindex meta tag); whatever was parsed before
+         the throw is still used below, so the exception is ignored. */
+      try {
+        htmlParser.parse_html(token.content, "UTF-8", true);
+      } catch (...) {
+      }
+
+      /* Title: prefer the one found in the HTML, fall back to the token's */
+      string accentedTitle = htmlParser.title;
+      if (accentedTitle.empty()) {
+        accentedTitle = token.title;
+      }
+
+      /* Only handle articles whose text dump does not contain "NOINDEX" */
+      const size_t found = htmlParser.dump.find("NOINDEX");
+
+      if (found == string::npos) {
+        /* Word count, as a string */
+        stringstream countWordStringStream;
+        countWordStringStream << self->countWords(htmlParser.dump);
+        const std::string wordCountString = countWordStringStream.str();
+
+        /* Snippet: first 300 bytes of the dump, cut back to the last '.'
+           or, when there is none, to the last space */
+        std::string snippet = std::string(htmlParser.dump, 0, 300);
+        std::string::size_type last = snippet.find_last_of('.');
+        if (last == snippet.npos) {
+          last = snippet.find_last_of(' ');
+        }
+        if (last != snippet.npos) {
+          snippet = snippet.substr(0, last);
+        }
+
+        /* Article size in KiB, as a string */
+        stringstream sizeStringStream;
+        sizeStringStream << token.content.size() / 1024;
+        const std::string size = sizeStringStream.str();
+      }
+    }
+
     pthread_exit(NULL);
     return NULL;
   }
diff --git a/src/common/kiwix/indexer.h b/src/common/kiwix/indexer.h
index aa9aa65f9..fe95c17ea 100644
--- a/src/common/kiwix/indexer.h
+++ b/src/common/kiwix/indexer.h
@@ -41,6 +41,7 @@ namespace kiwix {
 
   struct indexerArticleToken {
     string title;
+    string url; /* set from currentArticle.getLongUrl() in indexer.cpp */
     string content;
   };