Add "eng" to stop word map, add pragma marks in indexer.cpp, and rework progress calculation

This commit is contained in:
Chris Li 2016-06-07 14:32:11 -04:00 committed by kelson42
parent 566a01ce7f
commit 801e622644
3 changed files with 37 additions and 17 deletions

View File

@ -71,8 +71,14 @@ namespace kiwix {
while (getline(file, stopWord, '\n')) { while (getline(file, stopWord, '\n')) {
this->stopWords.push_back(stopWord); this->stopWords.push_back(stopWord);
} }
if (this->verboseFlag) {
std::cout << "Read stop words, lang code:" << languageCode << ", count:" << this->stopWords.size() << std::endl;
}
} }
#pragma mark - Extractor
/* Article extractor methods */ /* Article extractor methods */
void *Indexer::extractArticles(void *ptr) { void *Indexer::extractArticles(void *ptr) {
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
@ -89,7 +95,7 @@ namespace kiwix {
unsigned int readArticleCount = 0; unsigned int readArticleCount = 0;
unsigned int currentProgression = 0; unsigned int currentProgression = 0;
self->setProgression(currentProgression); self->setProgression(currentProgression);
unsigned int tmpProgression; unsigned int newProgress;
/* StopWords */ /* StopWords */
self->readStopWords(reader.getLanguage()); self->readStopWords(reader.getLanguage());
@ -101,35 +107,34 @@ namespace kiwix {
zim::Article currentArticle; zim::Article currentArticle;
while (currentOffset < lastOffset) { while (currentOffset < lastOffset) {
if (self->getVerboseFlag()) { // if (self->getVerboseFlag()) {
std::cout << "currentOffset:" << currentOffset << " lastOffset:" << lastOffset // std::cout << "currentOffset:" << currentOffset << " lastOffset:" << lastOffset
<< " readArticleCount:" << readArticleCount << " totalArticleCount:" << articleCount <<std::endl; // << " readArticleCount:" << readArticleCount << " totalArticleCount:" << articleCount <<std::endl;
} // }
currentArticle = zimHandler->getArticle(currentOffset); currentArticle = zimHandler->getArticle(currentOffset);
if (!currentArticle.isRedirect()) { if (!currentArticle.isRedirect()) {
/* Add articles to the queue */ /* Add articles to the queue */
indexerToken token; indexerToken token;
token.title = currentArticle.getTitle(); token.title = currentArticle.getTitle();
token.url = currentArticle.getLongUrl(); token.url = currentArticle.getLongUrl();
token.content = string(currentArticle.getData().data(), currentArticle.getData().size()); token.content = string(currentArticle.getData().data(), currentArticle.getData().size());
self->pushToParseQueue(token); self->pushToParseQueue(token);
readArticleCount += 1; readArticleCount += 1;
/* Update progress */
if (self->progressCallback) { if (self->progressCallback) {
self->progressCallback(readArticleCount, articleCount); self->progressCallback(readArticleCount, articleCount);
} }
newProgress = (unsigned int)((float)readArticleCount / (float)articleCount * 100);
if (newProgress != currentProgression) {
self->setProgression(newProgress);
}
} }
currentOffset += 1; currentOffset += 1;
/* Update the progression counter (in percent) */
tmpProgression = (unsigned int)((float)readArticleCount/(float)articleCount*100 - 1);
if (tmpProgression > currentProgression) {
currentProgression = tmpProgression;
self->setProgression(currentProgression);
}
/* Test if the thread should be cancelled */ /* Test if the thread should be cancelled */
pthread_testcancel(); pthread_testcancel();
} }
@ -152,6 +157,8 @@ namespace kiwix {
return retVal; return retVal;
} }
#pragma mark - Parser
/* Article parser methods */ /* Article parser methods */
void *Indexer::parseArticles(void *ptr) { void *Indexer::parseArticles(void *ptr) {
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
@ -224,6 +231,8 @@ namespace kiwix {
return retVal; return retVal;
} }
#pragma mark - Indexer
/* Article indexer methods */ /* Article indexer methods */
void *Indexer::indexArticles(void *ptr) { void *Indexer::indexArticles(void *ptr) {
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
@ -281,6 +290,8 @@ namespace kiwix {
return retVal; return retVal;
} }
#pragma mark - Parse Queue
/* ToParseQueue methods */ /* ToParseQueue methods */
bool Indexer::isToParseQueueEmpty() { bool Indexer::isToParseQueueEmpty() {
pthread_mutex_lock(&toParseQueueMutex); pthread_mutex_lock(&toParseQueueMutex);
@ -318,6 +329,8 @@ namespace kiwix {
return true; return true;
} }
#pragma mark - Index Queue
/* ToIndexQueue methods */ /* ToIndexQueue methods */
bool Indexer::isToIndexQueueEmpty() { bool Indexer::isToIndexQueueEmpty() {
pthread_mutex_lock(&toIndexQueueMutex); pthread_mutex_lock(&toIndexQueueMutex);
@ -355,6 +368,8 @@ namespace kiwix {
return true; return true;
} }
#pragma mark - Properties Getter & Setter
/* ZIM & Index methods */ /* ZIM & Index methods */
void Indexer::setZimPath(const string path) { void Indexer::setZimPath(const string path) {
pthread_mutex_lock(&zimPathMutex); pthread_mutex_lock(&zimPathMutex);
@ -421,6 +436,8 @@ namespace kiwix {
return retVal; return retVal;
} }
#pragma mark - Status Management
/* Manage */ /* Manage */
bool Indexer::start(const string zimPath, const string indexPath, ProgressCallback callback) { bool Indexer::start(const string zimPath, const string indexPath, ProgressCallback callback) {
if (this->getVerboseFlag()) { if (this->getVerboseFlag()) {
@ -494,6 +511,8 @@ namespace kiwix {
return true; return true;
} }
#pragma mark - verbose
/* Manage the verboseFlag */ /* Manage the verboseFlag */
void Indexer::setVerboseFlag(const bool value) { void Indexer::setVerboseFlag(const bool value) {
pthread_mutex_lock(&verboseMutex); pthread_mutex_lock(&verboseMutex);

0
src/common/resourceTools.cpp Normal file → Executable file
View File

1
src/common/resourceTools.h Normal file → Executable file
View File

@ -39181,6 +39181,7 @@ const unsigned char server_include_html_part[]={
static std::map<std::string, std::pair<const unsigned char*, unsigned int> > createResourceMap() { static std::map<std::string, std::pair<const unsigned char*, unsigned int> > createResourceMap() {
std::map<std::string, std::pair<const unsigned char*, unsigned int> > m; std::map<std::string, std::pair<const unsigned char*, unsigned int> > m;
m["stopwords/en"] = std::pair <const unsigned char*, unsigned int>(stopwords_en, sizeof stopwords_en); m["stopwords/en"] = std::pair <const unsigned char*, unsigned int>(stopwords_en, sizeof stopwords_en);
m["stopwords/eng"] = std::pair <const unsigned char*, unsigned int>(stopwords_en, sizeof stopwords_en);
m["stopwords/fra"] = std::pair <const unsigned char*, unsigned int>(stopwords_fra, sizeof stopwords_fra); m["stopwords/fra"] = std::pair <const unsigned char*, unsigned int>(stopwords_fra, sizeof stopwords_fra);
m["stopwords/he"] = std::pair <const unsigned char*, unsigned int>(stopwords_he, sizeof stopwords_he); m["stopwords/he"] = std::pair <const unsigned char*, unsigned int>(stopwords_he, sizeof stopwords_he);
m["results.tmpl"] = std::pair <const unsigned char*, unsigned int>(results_tmpl, sizeof results_tmpl); m["results.tmpl"] = std::pair <const unsigned char*, unsigned int>(results_tmpl, sizeof results_tmpl);