mirror of https://github.com/kiwix/libkiwix.git
+ stopwords code stub
This commit is contained in:
parent
fe1095082c
commit
c22049ea8c
|
@ -18,7 +18,6 @@ namespace kiwix {
|
|||
/* Constructor */
|
||||
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath)
|
||||
: zimFileHandler(NULL),
|
||||
stemmer(Xapian::Stem("english")),
|
||||
articleCount(0),
|
||||
stepSize(0) {
|
||||
|
||||
|
@ -28,7 +27,25 @@ namespace kiwix {
|
|||
/* Open the Xapian directory */
|
||||
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
|
||||
|
||||
/* Stemming */
|
||||
/*
|
||||
stemmer = Xapian::Stem("french");
|
||||
indexer.set_stemmer(stemmer);
|
||||
*/
|
||||
|
||||
/* Read the stopwords file */
|
||||
/*
|
||||
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
|
||||
this->stopper.add("ceci");
|
||||
while (stopWordsIterator != this->stopWords.end()) {
|
||||
this->stopper.add(*stopWordsIterator);
|
||||
stopWordsIterator++;
|
||||
}
|
||||
indexer.set_stopper(&(this->stopper));
|
||||
*/
|
||||
|
||||
/* Prepare the indexation */
|
||||
this->prepareIndexing();
|
||||
}
|
||||
|
@ -89,10 +106,6 @@ namespace kiwix {
|
|||
|
||||
if (found == string::npos) {
|
||||
|
||||
/* Set the stemmer */
|
||||
/* TODO, autodetect the language */
|
||||
//indexer.set_stemmer(stemmer);
|
||||
|
||||
/* Put the data in the document */
|
||||
Xapian::Document document;
|
||||
document.add_value(0, this->htmlParser.title);
|
||||
|
@ -152,5 +165,19 @@ namespace kiwix {
|
|||
this->writableDatabase = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Read the file containing the stopwords */
|
||||
bool Indexer::readStopWordsFile(const string path) {
|
||||
std::string stopWord;
|
||||
std::ifstream file(path.c_str(), std::ios_base::in);
|
||||
|
||||
this->stopWords.clear();
|
||||
|
||||
while (getline(file, stopWord, '\n')) {
|
||||
this->stopWords.push_back(stopWord);
|
||||
}
|
||||
|
||||
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
#ifndef KIWIX_INDEXER_H
|
||||
#define KIWIX_INDEXER_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include <xapian.h>
|
||||
#include <unaccent.h>
|
||||
#include <zim/file.h>
|
||||
|
@ -11,7 +16,7 @@
|
|||
using namespace std;
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
|
||||
class Indexer {
|
||||
|
||||
public:
|
||||
|
@ -25,6 +30,8 @@ namespace kiwix {
|
|||
void stopIndexing();
|
||||
unsigned int countWords(const string &text);
|
||||
|
||||
bool readStopWordsFile(const string path);
|
||||
|
||||
unsigned int articleCount;
|
||||
float stepSize;
|
||||
|
||||
|
@ -36,8 +43,10 @@ namespace kiwix {
|
|||
|
||||
Xapian::WritableDatabase *writableDatabase;
|
||||
Xapian::Stem stemmer;
|
||||
Xapian::SimpleStopper stopper;
|
||||
Xapian::TermGenerator indexer;
|
||||
|
||||
|
||||
std::vector<std::string> stopWords;
|
||||
MyHtmlParser htmlParser;
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue