mirror of https://github.com/kiwix/libkiwix.git
+ factorizing of the kiwix::Indexer class in kiwix::Indexer (abstract) and kiwix::XapianIndexer
This commit is contained in:
parent
d52c86bcac
commit
ef2423b1a7
|
@ -20,158 +20,9 @@ namespace kiwix {
|
|||
: zimFileHandler(NULL),
|
||||
articleCount(0),
|
||||
stepSize(0) {
|
||||
|
||||
|
||||
/* Open the ZIM file */
|
||||
this->zimFileHandler = new zim::File(zimFilePath);
|
||||
|
||||
/* Open the Xapian directory */
|
||||
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
|
||||
/* Stemming */
|
||||
/*
|
||||
stemmer = Xapian::Stem("french");
|
||||
indexer.set_stemmer(stemmer);
|
||||
*/
|
||||
|
||||
/* Read the stopwords file */
|
||||
/*
|
||||
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
|
||||
this->stopper.add("ceci");
|
||||
while (stopWordsIterator != this->stopWords.end()) {
|
||||
this->stopper.add(*stopWordsIterator);
|
||||
stopWordsIterator++;
|
||||
}
|
||||
indexer.set_stopper(&(this->stopper));
|
||||
*/
|
||||
|
||||
/* Prepare the indexation */
|
||||
this->prepareIndexing();
|
||||
}
|
||||
|
||||
/* Destructor */
|
||||
Indexer::~Indexer() {
|
||||
this->stopIndexing();
|
||||
}
|
||||
|
||||
/* Start indexing */
|
||||
void Indexer::prepareIndexing() {
|
||||
|
||||
/* Define a few values */
|
||||
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
|
||||
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
|
||||
this->currentArticleOffset = this->firstArticleOffset;
|
||||
|
||||
/* Compute few things */
|
||||
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||
this->stepSize = (float)this->articleCount / (float)100;
|
||||
}
|
||||
|
||||
/* Index next percent */
|
||||
bool Indexer::indexNextPercent(const bool &verbose) {
|
||||
float thresholdOffset = this->currentArticleOffset + this->stepSize;
|
||||
size_t found;
|
||||
|
||||
/* Check if we can start */
|
||||
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Begin the Xapian transation */
|
||||
this->writableDatabase->begin_transaction(true);
|
||||
|
||||
while(this->currentArticleOffset < thresholdOffset &&
|
||||
this->currentArticleOffset < this->lastArticleOffset) {
|
||||
|
||||
zim::Article currentArticle;
|
||||
Xapian::Document currentDocument;
|
||||
|
||||
/* Get next non redirect article */
|
||||
do {
|
||||
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
|
||||
} while (this->currentArticleOffset++ &&
|
||||
currentArticle.isRedirect() &&
|
||||
this->currentArticleOffset != this->lastArticleOffset);
|
||||
|
||||
if (!currentArticle.isRedirect()) {
|
||||
|
||||
/* Index the content */
|
||||
this->htmlParser.reset();
|
||||
string content (currentArticle.getData().data(), currentArticle.getData().size());
|
||||
|
||||
/* The parser generate a lot of exceptions which should be avoided */
|
||||
try {
|
||||
this->htmlParser.parse_html(content, "UTF-8", true);
|
||||
} catch (...) {
|
||||
}
|
||||
|
||||
/* If content does not have the noindex meta tag */
|
||||
/* Seems that the parser generates an exception in such case */
|
||||
found = this->htmlParser.dump.find("NOINDEX");
|
||||
|
||||
if (found == string::npos) {
|
||||
|
||||
/* Put the data in the document */
|
||||
currentDocument.clear_values();
|
||||
currentDocument.add_value(0, this->htmlParser.title);
|
||||
currentDocument.set_data(currentArticle.getLongUrl().c_str());
|
||||
indexer.set_document(currentDocument);
|
||||
|
||||
/* Debug output */
|
||||
if (verbose) {
|
||||
std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
|
||||
}
|
||||
|
||||
/* Index the title */
|
||||
if (!this->htmlParser.title.empty()) {
|
||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.title),
|
||||
((this->htmlParser.dump.size() / 100) + 1) /
|
||||
countWords(this->htmlParser.title) );
|
||||
}
|
||||
|
||||
/* Index the keywords */
|
||||
if (!this->htmlParser.keywords.empty()) {
|
||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
|
||||
}
|
||||
|
||||
/* Index the content */
|
||||
if (!this->htmlParser.dump.empty()) {
|
||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
|
||||
}
|
||||
|
||||
/* add to the database */
|
||||
this->writableDatabase->add_document(currentDocument);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Flush and close Xapian transaction*/
|
||||
this->writableDatabase->commit_transaction();
|
||||
|
||||
/* increment the offset and set returned value */
|
||||
if (this->currentArticleOffset < this->lastArticleOffset) {
|
||||
this->currentArticleOffset++;
|
||||
return true;
|
||||
} else {
|
||||
this->stopIndexing();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */
|
||||
void Indexer::stopIndexing() {
|
||||
/* Delete the zimFileHandler */
|
||||
if (this->zimFileHandler != NULL) {
|
||||
delete this->zimFileHandler;
|
||||
this->zimFileHandler = NULL;
|
||||
}
|
||||
|
||||
/* Delete the Xapian writableDatabase */
|
||||
if (this->writableDatabase != NULL) {
|
||||
delete this->writableDatabase;
|
||||
this->writableDatabase = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Read the file containing the stopwords */
|
||||
|
|
|
@ -21,34 +21,30 @@ namespace kiwix {
|
|||
|
||||
public:
|
||||
Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
||||
~Indexer();
|
||||
|
||||
bool indexNextPercent(const bool &verbose = false);
|
||||
virtual bool indexNextPercent(const bool &verbose = false) = 0;
|
||||
|
||||
protected:
|
||||
void prepareIndexing();
|
||||
void stopIndexing();
|
||||
unsigned int countWords(const string &text);
|
||||
|
||||
bool readStopWordsFile(const string path);
|
||||
|
||||
unsigned int articleCount;
|
||||
float stepSize;
|
||||
|
||||
virtual void prepareIndexing() = 0;
|
||||
virtual void stopIndexing() = 0;
|
||||
|
||||
/* ZIM file handling */
|
||||
zim::File* zimFileHandler;
|
||||
zim::size_type firstArticleOffset;
|
||||
zim::size_type lastArticleOffset;
|
||||
zim::size_type currentArticleOffset;
|
||||
|
||||
Xapian::WritableDatabase *writableDatabase;
|
||||
Xapian::Stem stemmer;
|
||||
Xapian::SimpleStopper stopper;
|
||||
Xapian::TermGenerator indexer;
|
||||
|
||||
std::vector<std::string> stopWords;
|
||||
/* HTML parsing */
|
||||
MyHtmlParser htmlParser;
|
||||
};
|
||||
unsigned int countWords(const string &text);
|
||||
|
||||
/* Stopwords */
|
||||
bool readStopWordsFile(const string path);
|
||||
std::vector<std::string> stopWords;
|
||||
|
||||
/* Others */
|
||||
unsigned int articleCount;
|
||||
float stepSize;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,158 @@
|
|||
#include "xapianIndexer.h"
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
/* Constructor */
|
||||
XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
|
||||
Indexer(zimFilePath, xapianDirectoryPath) {
|
||||
|
||||
/* Open the Xapian directory */
|
||||
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
|
||||
/* Stemming */
|
||||
/*
|
||||
stemmer = Xapian::Stem("french");
|
||||
indexer.set_stemmer(stemmer);
|
||||
*/
|
||||
|
||||
/* Read the stopwords file */
|
||||
/*
|
||||
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
|
||||
this->stopper.add("ceci");
|
||||
while (stopWordsIterator != this->stopWords.end()) {
|
||||
this->stopper.add(*stopWordsIterator);
|
||||
stopWordsIterator++;
|
||||
}
|
||||
indexer.set_stopper(&(this->stopper));
|
||||
*/
|
||||
|
||||
/* Prepare the indexation */
|
||||
this->prepareIndexing();
|
||||
}
|
||||
|
||||
/* Destructor */
|
||||
XapianIndexer::~XapianIndexer() {
|
||||
this->stopIndexing();
|
||||
}
|
||||
|
||||
/* Start indexing */
|
||||
void XapianIndexer::prepareIndexing() {
|
||||
|
||||
/* Define a few values */
|
||||
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
|
||||
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
|
||||
this->currentArticleOffset = this->firstArticleOffset;
|
||||
|
||||
/* Compute few things */
|
||||
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||
this->stepSize = (float)this->articleCount / (float)100;
|
||||
}
|
||||
|
||||
/* Index next percent */
|
||||
bool XapianIndexer::indexNextPercent(const bool &verbose) {
|
||||
float thresholdOffset = this->currentArticleOffset + this->stepSize;
|
||||
size_t found;
|
||||
|
||||
/* Check if we can start */
|
||||
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Begin the Xapian transation */
|
||||
this->writableDatabase->begin_transaction(true);
|
||||
|
||||
while(this->currentArticleOffset < thresholdOffset &&
|
||||
this->currentArticleOffset < this->lastArticleOffset) {
|
||||
|
||||
zim::Article currentArticle;
|
||||
Xapian::Document currentDocument;
|
||||
|
||||
/* Get next non redirect article */
|
||||
do {
|
||||
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
|
||||
} while (this->currentArticleOffset++ &&
|
||||
currentArticle.isRedirect() &&
|
||||
this->currentArticleOffset != this->lastArticleOffset);
|
||||
|
||||
if (!currentArticle.isRedirect()) {
|
||||
|
||||
/* Index the content */
|
||||
this->htmlParser.reset();
|
||||
string content (currentArticle.getData().data(), currentArticle.getData().size());
|
||||
|
||||
/* The parser generate a lot of exceptions which should be avoided */
|
||||
try {
|
||||
this->htmlParser.parse_html(content, "UTF-8", true);
|
||||
} catch (...) {
|
||||
}
|
||||
|
||||
/* If content does not have the noindex meta tag */
|
||||
/* Seems that the parser generates an exception in such case */
|
||||
found = this->htmlParser.dump.find("NOINDEX");
|
||||
|
||||
if (found == string::npos) {
|
||||
|
||||
/* Put the data in the document */
|
||||
currentDocument.clear_values();
|
||||
currentDocument.add_value(0, this->htmlParser.title);
|
||||
currentDocument.set_data(currentArticle.getLongUrl().c_str());
|
||||
indexer.set_document(currentDocument);
|
||||
|
||||
/* Debug output */
|
||||
if (verbose) {
|
||||
std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
|
||||
}
|
||||
|
||||
/* Index the title */
|
||||
if (!this->htmlParser.title.empty()) {
|
||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.title),
|
||||
((this->htmlParser.dump.size() / 100) + 1) /
|
||||
countWords(this->htmlParser.title) );
|
||||
}
|
||||
|
||||
/* Index the keywords */
|
||||
if (!this->htmlParser.keywords.empty()) {
|
||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
|
||||
}
|
||||
|
||||
/* Index the content */
|
||||
if (!this->htmlParser.dump.empty()) {
|
||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
|
||||
}
|
||||
|
||||
/* add to the database */
|
||||
this->writableDatabase->add_document(currentDocument);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Flush and close Xapian transaction*/
|
||||
this->writableDatabase->commit_transaction();
|
||||
|
||||
/* increment the offset and set returned value */
|
||||
if (this->currentArticleOffset < this->lastArticleOffset) {
|
||||
this->currentArticleOffset++;
|
||||
return true;
|
||||
} else {
|
||||
this->stopIndexing();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */
|
||||
void XapianIndexer::stopIndexing() {
|
||||
/* Delete the zimFileHandler */
|
||||
if (this->zimFileHandler != NULL) {
|
||||
delete this->zimFileHandler;
|
||||
this->zimFileHandler = NULL;
|
||||
}
|
||||
|
||||
/* Delete the Xapian writableDatabase */
|
||||
if (this->writableDatabase != NULL) {
|
||||
delete this->writableDatabase;
|
||||
this->writableDatabase = NULL;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
#ifndef KIWIX_XAPIAN_INDEXER_H
|
||||
#define KIWIX_XAPIAN_INDEXER_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include <xapian.h>
|
||||
#include <unaccent.h>
|
||||
#include <zim/file.h>
|
||||
#include <zim/article.h>
|
||||
#include <zim/fileiterator.h>
|
||||
#include "xapian/myhtmlparse.h"
|
||||
#include "indexer.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
class XapianIndexer : public Indexer {
|
||||
|
||||
public:
|
||||
XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
||||
~XapianIndexer();
|
||||
|
||||
bool indexNextPercent(const bool &verbose = false);
|
||||
|
||||
protected:
|
||||
void prepareIndexing();
|
||||
void stopIndexing();
|
||||
|
||||
Xapian::WritableDatabase *writableDatabase;
|
||||
Xapian::Stem stemmer;
|
||||
Xapian::SimpleStopper stopper;
|
||||
Xapian::TermGenerator indexer;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue