mirror of https://github.com/kiwix/libkiwix.git
+ stopwords code stub
This commit is contained in:
parent
fe1095082c
commit
c22049ea8c
|
@ -18,7 +18,6 @@ namespace kiwix {
|
||||||
/* Constructor */
|
/* Constructor */
|
||||||
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath)
|
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath)
|
||||||
: zimFileHandler(NULL),
|
: zimFileHandler(NULL),
|
||||||
stemmer(Xapian::Stem("english")),
|
|
||||||
articleCount(0),
|
articleCount(0),
|
||||||
stepSize(0) {
|
stepSize(0) {
|
||||||
|
|
||||||
|
@ -29,6 +28,24 @@ namespace kiwix {
|
||||||
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
||||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
|
|
||||||
|
/* Stemming *
|
||||||
|
/*
|
||||||
|
stemmer = Xapian::Stem("french");
|
||||||
|
indexer.set_stemmer(stemmer);
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Read the stopwords file */
|
||||||
|
/*
|
||||||
|
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||||
|
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
|
||||||
|
this->stopper.add("ceci");
|
||||||
|
while (stopWordsIterator != this->stopWords.end()) {
|
||||||
|
this->stopper.add(*stopWordsIterator);
|
||||||
|
stopWordsIterator++;
|
||||||
|
}
|
||||||
|
indexer.set_stopper(&(this->stopper));
|
||||||
|
*/
|
||||||
|
|
||||||
/* Prepare the indexation */
|
/* Prepare the indexation */
|
||||||
this->prepareIndexing();
|
this->prepareIndexing();
|
||||||
}
|
}
|
||||||
|
@ -89,10 +106,6 @@ namespace kiwix {
|
||||||
|
|
||||||
if (found == string::npos) {
|
if (found == string::npos) {
|
||||||
|
|
||||||
/* Set the stemmer */
|
|
||||||
/* TODO, autodetect the language */
|
|
||||||
//indexer.set_stemmer(stemmer);
|
|
||||||
|
|
||||||
/* Put the data in the document */
|
/* Put the data in the document */
|
||||||
Xapian::Document document;
|
Xapian::Document document;
|
||||||
document.add_value(0, this->htmlParser.title);
|
document.add_value(0, this->htmlParser.title);
|
||||||
|
@ -153,4 +166,18 @@ namespace kiwix {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Read the file containing the stopwords */
|
||||||
|
bool Indexer::readStopWordsFile(const string path) {
|
||||||
|
std::string stopWord;
|
||||||
|
std::ifstream file(path.c_str(), std::ios_base::in);
|
||||||
|
|
||||||
|
this->stopWords.clear();
|
||||||
|
|
||||||
|
while (getline(file, stopWord, '\n')) {
|
||||||
|
this->stopWords.push_back(stopWord);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,11 @@
|
||||||
#ifndef KIWIX_INDEXER_H
|
#ifndef KIWIX_INDEXER_H
|
||||||
#define KIWIX_INDEXER_H
|
#define KIWIX_INDEXER_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
#include <xapian.h>
|
#include <xapian.h>
|
||||||
#include <unaccent.h>
|
#include <unaccent.h>
|
||||||
#include <zim/file.h>
|
#include <zim/file.h>
|
||||||
|
@ -25,6 +30,8 @@ namespace kiwix {
|
||||||
void stopIndexing();
|
void stopIndexing();
|
||||||
unsigned int countWords(const string &text);
|
unsigned int countWords(const string &text);
|
||||||
|
|
||||||
|
bool readStopWordsFile(const string path);
|
||||||
|
|
||||||
unsigned int articleCount;
|
unsigned int articleCount;
|
||||||
float stepSize;
|
float stepSize;
|
||||||
|
|
||||||
|
@ -36,8 +43,10 @@ namespace kiwix {
|
||||||
|
|
||||||
Xapian::WritableDatabase *writableDatabase;
|
Xapian::WritableDatabase *writableDatabase;
|
||||||
Xapian::Stem stemmer;
|
Xapian::Stem stemmer;
|
||||||
|
Xapian::SimpleStopper stopper;
|
||||||
Xapian::TermGenerator indexer;
|
Xapian::TermGenerator indexer;
|
||||||
|
|
||||||
|
std::vector<std::string> stopWords;
|
||||||
MyHtmlParser htmlParser;
|
MyHtmlParser htmlParser;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue