+ stopwords code stub

This commit is contained in:
kelson42 2010-04-06 08:21:26 +00:00
parent fe1095082c
commit c22049ea8c
2 changed files with 44 additions and 8 deletions

View File

@ -18,7 +18,6 @@ namespace kiwix {
/* Constructor */
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath)
: zimFileHandler(NULL),
stemmer(Xapian::Stem("english")),
articleCount(0),
stepSize(0) {
@ -28,7 +27,25 @@ namespace kiwix {
/* Open the Xapian directory */
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
Xapian::DB_CREATE_OR_OVERWRITE);
/* Stemming */
/*
stemmer = Xapian::Stem("french");
indexer.set_stemmer(stemmer);
*/
/* Read the stopwords file */
/*
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
this->stopper.add("ceci");
while (stopWordsIterator != this->stopWords.end()) {
this->stopper.add(*stopWordsIterator);
stopWordsIterator++;
}
indexer.set_stopper(&(this->stopper));
*/
/* Prepare the indexation */
this->prepareIndexing();
}
@ -89,10 +106,6 @@ namespace kiwix {
if (found == string::npos) {
/* Set the stemmer */
/* TODO, autodetect the language */
//indexer.set_stemmer(stemmer);
/* Put the data in the document */
Xapian::Document document;
document.add_value(0, this->htmlParser.title);
@ -152,5 +165,19 @@ namespace kiwix {
this->writableDatabase = NULL;
}
}
/*
 * Read the file containing the stopwords.
 *
 * The file at 'path' is read line by line ('\n' separated) and each line is
 * stored, unmodified, as one entry of this->stopWords. Any previous content
 * of this->stopWords is discarded first, even if the file cannot be opened.
 *
 * The number of lines read is reported on std::cout.
 *
 * Returns true if the file could be opened and no low-level read error
 * occurred, false otherwise.
 */
bool Indexer::readStopWordsFile(const string path) {
  std::string stopWord;
  std::ifstream file(path.c_str(), std::ios_base::in);

  /* Always start from an empty list so a failed read never leaves stale
     entries behind */
  this->stopWords.clear();

  /* Remember whether the open succeeded; getline() simply fails on a
     stream that is not open, so the loop below is safe either way */
  const bool opened = file.is_open();

  while (std::getline(file, stopWord, '\n')) {
    this->stopWords.push_back(stopWord);
  }

  std::cout << "Read " << this->stopWords.size() << " lines.\n";

  /* Reaching end-of-file sets eofbit/failbit, which is the normal way out
     of the loop; only badbit signals a real I/O error */
  return opened && !file.bad();
}
}

View File

@ -1,6 +1,11 @@
#ifndef KIWIX_INDEXER_H
#define KIWIX_INDEXER_H
#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <xapian.h>
#include <unaccent.h>
#include <zim/file.h>
@ -11,7 +16,7 @@
using namespace std;
namespace kiwix {
class Indexer {
public:
@ -25,6 +30,8 @@ namespace kiwix {
void stopIndexing();
unsigned int countWords(const string &text);
bool readStopWordsFile(const string path);
unsigned int articleCount;
float stepSize;
@ -36,8 +43,10 @@ namespace kiwix {
Xapian::WritableDatabase *writableDatabase;
Xapian::Stem stemmer;
Xapian::SimpleStopper stopper;
Xapian::TermGenerator indexer;
std::vector<std::string> stopWords;
MyHtmlParser htmlParser;
};