mirror of https://github.com/kiwix/libkiwix.git
+ first version of stopwords support
This commit is contained in:
parent
c1961faeb4
commit
10d855ced5
|
@ -51,9 +51,18 @@ namespace kiwix {
|
|||
pthread_mutex_init(&indexPathMutex, NULL);
|
||||
pthread_mutex_init(&progressionMutex, NULL);
|
||||
pthread_mutex_init(&verboseMutex, NULL);
|
||||
|
||||
/* Read the stopwords file */
|
||||
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||
}
|
||||
|
||||
/* Read the stopwords */
|
||||
void Indexer::readStopWords(const string languageCode) {
|
||||
std::string stopWord;
|
||||
std::istringstream file(getResourceAsString("stopwords/" + languageCode));
|
||||
|
||||
this->stopWords.clear();
|
||||
|
||||
while (getline(file, stopWord, '\n')) {
|
||||
this->stopWords.push_back(stopWord);
|
||||
}
|
||||
}
|
||||
|
||||
/* Article extractor methods */
|
||||
|
@ -66,6 +75,9 @@ namespace kiwix {
|
|||
unsigned int articleCount = reader.getArticleCount();
|
||||
self->setArticleCount(articleCount);
|
||||
|
||||
/* StopWords */
|
||||
self->readStopWords(reader.getLanguage());
|
||||
|
||||
/* Goes through all articles */
|
||||
zim::File *zimHandler = reader.getZimFileHandler();
|
||||
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
|
||||
|
@ -390,7 +402,6 @@ namespace kiwix {
|
|||
|
||||
/* Manage */
|
||||
bool Indexer::start(const string zimPath, const string indexPath) {
|
||||
|
||||
if (this->getVerboseFlag()) {
|
||||
std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
|
||||
}
|
||||
|
@ -451,21 +462,6 @@ namespace kiwix {
|
|||
return true;
|
||||
}
|
||||
|
||||
/* Read the file containing the stopwords */
|
||||
bool Indexer::readStopWordsFile(const string path) {
|
||||
std::string stopWord;
|
||||
std::ifstream file(path.c_str(), std::ios_base::in);
|
||||
|
||||
this->stopWords.clear();
|
||||
|
||||
while (getline(file, stopWord, '\n')) {
|
||||
this->stopWords.push_back(stopWord);
|
||||
}
|
||||
|
||||
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Manage the verboseFlag */
|
||||
void Indexer::setVerboseFlag(const bool value) {
|
||||
pthread_mutex_lock(&verboseMutex);
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
|
||||
#include <pthread.h>
|
||||
#include <stringTools.h>
|
||||
#include <resourceTools.h>
|
||||
#include <zim/file.h>
|
||||
#include <zim/article.h>
|
||||
#include <zim/fileiterator.h>
|
||||
|
@ -79,14 +80,14 @@ namespace kiwix {
|
|||
const string &wordCount) = 0;
|
||||
virtual void flush() = 0;
|
||||
virtual void indexingPostlude() = 0;
|
||||
|
||||
|
||||
/* Stop words */
|
||||
std::vector<std::string> stopWords;
|
||||
void readStopWords(const string languageCode);
|
||||
|
||||
/* Others */
|
||||
unsigned int countWords(const string &text);
|
||||
|
||||
/* Stopwords */
|
||||
bool readStopWordsFile(const string path);
|
||||
std::vector<std::string> stopWords;
|
||||
|
||||
/* Boost factor */
|
||||
unsigned int keywordsBoostFactor;
|
||||
inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {
|
||||
|
|
|
@ -44,6 +44,16 @@ namespace kiwix {
|
|||
/**
 * Prepare the Xapian database before the indexing starts.
 *
 * Opens the writable database at indexPath with DB_CREATE_OR_OVERWRITE and
 * begins a transaction on it. If stop words have been loaded, each of them
 * is added to this->stopper, which is then attached to this->indexer.
 *
 * @param indexPath  path of the Xapian database to create or overwrite.
 */
void XapianIndexer::indexingPrelude(const string indexPath) {
  this->writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE);
  this->writableDatabase.begin_transaction(true);

  /* Insert the stopwords */
  if (!this->stopWords.empty()) {
    /* No "typename" here: this is not a template, and the keyword is only
       tolerated outside of one since C++11. The list is only read, so a
       const_iterator is enough. */
    std::vector<std::string>::const_iterator it = this->stopWords.begin();
    for ( ; it != this->stopWords.end(); ++it) {
      this->stopper.add(*it);
    }

    /* Only a pointer is handed over; this->stopper is a member, so it
       lives as long as this object. */
    this->indexer.set_stopper(&(this->stopper));
  }
}
|
||||
|
||||
void XapianIndexer::index(const string &url,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue