+ first version of stopwords support

This commit is contained in:
kelson42 2012-09-01 11:54:23 +00:00
parent c1961faeb4
commit 10d855ced5
4 changed files with 729 additions and 361 deletions

View File

@ -51,9 +51,18 @@ namespace kiwix {
pthread_mutex_init(&indexPathMutex, NULL);
pthread_mutex_init(&progressionMutex, NULL);
pthread_mutex_init(&verboseMutex, NULL);
}
/* Read the stopwords file */
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
/*
 * Read the stopwords
 *
 * Replaces the content of this->stopWords with the words found in the
 * resource "stopwords/<languageCode>", one word per line.
 *
 * languageCode: suffix appended to "stopwords/" to build the resource name.
 *
 * NOTE(review): getResourceAsString() is declared outside this block;
 * presumably it returns an empty string for an unknown resource, in which
 * case the list simply ends up empty -- confirm.
 */
void Indexer::readStopWords(const string languageCode) {
  std::istringstream file(getResourceAsString("stopwords/" + languageCode));
  std::string stopWord;

  this->stopWords.clear();
  while (getline(file, stopWord, '\n')) {
    /* A resource saved with CRLF line endings leaves a '\r' at the end of
       each line; such a word would never match an indexed term */
    if (!stopWord.empty() && stopWord[stopWord.size() - 1] == '\r') {
      stopWord.erase(stopWord.size() - 1);
    }

    /* Blank lines are not stop words */
    if (stopWord.empty()) {
      continue;
    }

    this->stopWords.push_back(stopWord);
  }
}
/* Article extractor methods */
@ -66,6 +75,9 @@ namespace kiwix {
unsigned int articleCount = reader.getArticleCount();
self->setArticleCount(articleCount);
/* StopWords */
self->readStopWords(reader.getLanguage());
/* Goes through all articles */
zim::File *zimHandler = reader.getZimFileHandler();
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
@ -390,7 +402,6 @@ namespace kiwix {
/* Manage */
bool Indexer::start(const string zimPath, const string indexPath) {
if (this->getVerboseFlag()) {
std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
}
@ -451,21 +462,6 @@ namespace kiwix {
return true;
}
/*
 * Read the file containing the stopwords
 *
 * Replaces the content of this->stopWords with the lines of the file at
 * 'path' (one entry per line, split on '\n').
 *
 * Returns true when the file could be opened and read, false when it could
 * not be opened. In the failure case this->stopWords is left empty.
 */
bool Indexer::readStopWordsFile(const string path) {
  /* Always start from an empty list, even if the file turns out to be
     unreadable, so that stale words are never kept */
  this->stopWords.clear();

  std::ifstream file(path.c_str(), std::ios_base::in);
  if (!file.is_open()) {
    /* Report the failure instead of pretending zero lines were read */
    return false;
  }

  std::string stopWord;
  while (getline(file, stopWord, '\n')) {
    this->stopWords.push_back(stopWord);
  }

  std::cout << "Read " << this->stopWords.size() << " lines.\n";
  return true;
}
/* Manage the verboseFlag */
void Indexer::setVerboseFlag(const bool value) {
pthread_mutex_lock(&verboseMutex);

View File

@ -36,6 +36,7 @@
#include <pthread.h>
#include <stringTools.h>
#include <resourceTools.h>
#include <zim/file.h>
#include <zim/article.h>
#include <zim/fileiterator.h>
@ -80,13 +81,13 @@ namespace kiwix {
virtual void flush() = 0;
virtual void indexingPostlude() = 0;
/* Stop words */
std::vector<std::string> stopWords;
void readStopWords(const string languageCode);
/* Others */
unsigned int countWords(const string &text);
/* Stopwords */
bool readStopWordsFile(const string path);
std::vector<std::string> stopWords;
/* Boost factor */
unsigned int keywordsBoostFactor;
inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {

View File

@ -44,6 +44,16 @@ namespace kiwix {
/*
 * Prepare the Xapian backend before any article is indexed.
 *
 * Opens the writable database at 'indexPath' with DB_CREATE_OR_OVERWRITE,
 * starts a transaction on it, and, when this->stopWords is not empty, feeds
 * every word to this->stopper and attaches that stopper to this->indexer.
 *
 * NOTE(review): this->stopWords is filled by Indexer::readStopWords();
 * confirm that it has run before this prelude is called, otherwise no
 * stopper gets installed.
 */
void XapianIndexer::indexingPrelude(const string indexPath) {
  this->writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE);
  this->writableDatabase.begin_transaction(true);

  /* Insert the stopwords */
  if (!this->stopWords.empty()) {
    /* No 'typename' here: this is not a template, and the keyword is only
       allowed outside of templates from C++11 onwards */
    std::vector<std::string>::const_iterator it = this->stopWords.begin();
    const std::vector<std::string>::const_iterator last = this->stopWords.end();
    for ( ; it != last; ++it) {
      this->stopper.add(*it);
    }

    /* The indexer is handed the address of the member stopper */
    this->indexer.set_stopper(&(this->stopper));
  }
}
void XapianIndexer::index(const string &url,

File diff suppressed because it is too large Load Diff