mirror of https://github.com/kiwix/libkiwix.git
+ first version of stopwords support
This commit is contained in:
parent
c1961faeb4
commit
10d855ced5
|
@ -51,9 +51,18 @@ namespace kiwix {
|
||||||
pthread_mutex_init(&indexPathMutex, NULL);
|
pthread_mutex_init(&indexPathMutex, NULL);
|
||||||
pthread_mutex_init(&progressionMutex, NULL);
|
pthread_mutex_init(&progressionMutex, NULL);
|
||||||
pthread_mutex_init(&verboseMutex, NULL);
|
pthread_mutex_init(&verboseMutex, NULL);
|
||||||
|
}
|
||||||
/* Read the stopwords file */
|
|
||||||
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
/* Read the stopwords */
|
||||||
|
void Indexer::readStopWords(const string languageCode) {
|
||||||
|
std::string stopWord;
|
||||||
|
std::istringstream file(getResourceAsString("stopwords/" + languageCode));
|
||||||
|
|
||||||
|
this->stopWords.clear();
|
||||||
|
|
||||||
|
while (getline(file, stopWord, '\n')) {
|
||||||
|
this->stopWords.push_back(stopWord);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Article extractor methods */
|
/* Article extractor methods */
|
||||||
|
@ -66,6 +75,9 @@ namespace kiwix {
|
||||||
unsigned int articleCount = reader.getArticleCount();
|
unsigned int articleCount = reader.getArticleCount();
|
||||||
self->setArticleCount(articleCount);
|
self->setArticleCount(articleCount);
|
||||||
|
|
||||||
|
/* StopWords */
|
||||||
|
self->readStopWords(reader.getLanguage());
|
||||||
|
|
||||||
/* Goes trough all articles */
|
/* Goes trough all articles */
|
||||||
zim::File *zimHandler = reader.getZimFileHandler();
|
zim::File *zimHandler = reader.getZimFileHandler();
|
||||||
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
|
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
|
||||||
|
@ -390,7 +402,6 @@ namespace kiwix {
|
||||||
|
|
||||||
/* Manage */
|
/* Manage */
|
||||||
bool Indexer::start(const string zimPath, const string indexPath) {
|
bool Indexer::start(const string zimPath, const string indexPath) {
|
||||||
|
|
||||||
if (this->getVerboseFlag()) {
|
if (this->getVerboseFlag()) {
|
||||||
std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
|
std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
|
||||||
}
|
}
|
||||||
|
@ -451,21 +462,6 @@ namespace kiwix {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Read the file containing the stopwords */
|
|
||||||
bool Indexer::readStopWordsFile(const string path) {
|
|
||||||
std::string stopWord;
|
|
||||||
std::ifstream file(path.c_str(), std::ios_base::in);
|
|
||||||
|
|
||||||
this->stopWords.clear();
|
|
||||||
|
|
||||||
while (getline(file, stopWord, '\n')) {
|
|
||||||
this->stopWords.push_back(stopWord);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Manage the verboseFlag */
|
/* Manage the verboseFlag */
|
||||||
void Indexer::setVerboseFlag(const bool value) {
|
void Indexer::setVerboseFlag(const bool value) {
|
||||||
pthread_mutex_lock(&verboseMutex);
|
pthread_mutex_lock(&verboseMutex);
|
||||||
|
|
|
@ -36,6 +36,7 @@
|
||||||
|
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <stringTools.h>
|
#include <stringTools.h>
|
||||||
|
#include <resourceTools.h>
|
||||||
#include <zim/file.h>
|
#include <zim/file.h>
|
||||||
#include <zim/article.h>
|
#include <zim/article.h>
|
||||||
#include <zim/fileiterator.h>
|
#include <zim/fileiterator.h>
|
||||||
|
@ -79,14 +80,14 @@ namespace kiwix {
|
||||||
const string &wordCount) = 0;
|
const string &wordCount) = 0;
|
||||||
virtual void flush() = 0;
|
virtual void flush() = 0;
|
||||||
virtual void indexingPostlude() = 0;
|
virtual void indexingPostlude() = 0;
|
||||||
|
|
||||||
|
/* Stop words */
|
||||||
|
std::vector<std::string> stopWords;
|
||||||
|
void readStopWords(const string languageCode);
|
||||||
|
|
||||||
/* Others */
|
/* Others */
|
||||||
unsigned int countWords(const string &text);
|
unsigned int countWords(const string &text);
|
||||||
|
|
||||||
/* Stopwords */
|
|
||||||
bool readStopWordsFile(const string path);
|
|
||||||
std::vector<std::string> stopWords;
|
|
||||||
|
|
||||||
/* Boost factor */
|
/* Boost factor */
|
||||||
unsigned int keywordsBoostFactor;
|
unsigned int keywordsBoostFactor;
|
||||||
inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {
|
inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {
|
||||||
|
|
|
@ -44,6 +44,16 @@ namespace kiwix {
|
||||||
void XapianIndexer::indexingPrelude(const string indexPath) {
|
void XapianIndexer::indexingPrelude(const string indexPath) {
|
||||||
this->writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE);
|
this->writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
this->writableDatabase.begin_transaction(true);
|
this->writableDatabase.begin_transaction(true);
|
||||||
|
|
||||||
|
/* Insert the stopwords */
|
||||||
|
if (!this->stopWords.empty()) {
|
||||||
|
typename std::vector<std::string>::iterator it = this->stopWords.begin();
|
||||||
|
for( ; it != this->stopWords.end(); ++it) {
|
||||||
|
this->stopper.add(*it);
|
||||||
|
}
|
||||||
|
|
||||||
|
this->indexer.set_stopper(&(this->stopper));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void XapianIndexer::index(const string &url,
|
void XapianIndexer::index(const string &url,
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue