mirror of https://github.com/kiwix/libkiwix.git
+ better factorization of the indexer code
This commit is contained in:
parent
ef2423b1a7
commit
c003035a5e
|
@ -16,13 +16,25 @@ namespace kiwix {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Constructor */
|
/* Constructor */
|
||||||
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath)
|
Indexer::Indexer(const string &zimFilePath)
|
||||||
: zimFileHandler(NULL),
|
: zimFileHandler(NULL),
|
||||||
articleCount(0),
|
articleCount(0),
|
||||||
stepSize(0) {
|
stepSize(0) {
|
||||||
|
|
||||||
/* Open the ZIM file */
|
/* Open the ZIM file */
|
||||||
this->zimFileHandler = new zim::File(zimFilePath);
|
this->zimFileHandler = new zim::File(zimFilePath);
|
||||||
|
|
||||||
|
/* Define a few values */
|
||||||
|
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
|
||||||
|
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
|
||||||
|
this->currentArticleOffset = this->firstArticleOffset;
|
||||||
|
|
||||||
|
/* Compute a few things */
|
||||||
|
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||||
|
this->stepSize = (float)this->articleCount / (float)100;
|
||||||
|
|
||||||
|
/* Read the stopwords file */
|
||||||
|
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Read the file containing the stopwords */
|
/* Read the file containing the stopwords */
|
||||||
|
@ -39,4 +51,73 @@ namespace kiwix {
|
||||||
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
std::cout << "Read " << this->stopWords.size() << " lines.\n";
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Index next percent */
|
||||||
|
bool Indexer::indexNextPercent(const bool &verbose) {
|
||||||
|
float thresholdOffset = this->currentArticleOffset + this->stepSize;
|
||||||
|
size_t found;
|
||||||
|
|
||||||
|
/* Check if we can start */
|
||||||
|
if (this->zimFileHandler == NULL) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
this->indexNextPercentPre();
|
||||||
|
|
||||||
|
while(this->currentArticleOffset < thresholdOffset &&
|
||||||
|
this->currentArticleOffset < this->lastArticleOffset) {
|
||||||
|
|
||||||
|
zim::Article currentArticle;
|
||||||
|
|
||||||
|
/* Get next non redirect article */
|
||||||
|
do {
|
||||||
|
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
|
||||||
|
} while (this->currentArticleOffset++ &&
|
||||||
|
currentArticle.isRedirect() &&
|
||||||
|
this->currentArticleOffset != this->lastArticleOffset);
|
||||||
|
|
||||||
|
if (!currentArticle.isRedirect()) {
|
||||||
|
|
||||||
|
/* Index the content */
|
||||||
|
this->htmlParser.reset();
|
||||||
|
string content (currentArticle.getData().data(), currentArticle.getData().size());
|
||||||
|
|
||||||
|
/* The parser generates a lot of exceptions which should be avoided */
|
||||||
|
try {
|
||||||
|
this->htmlParser.parse_html(content, "UTF-8", true);
|
||||||
|
} catch (...) {
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If content does not have the noindex meta tag */
|
||||||
|
/* Seems that the parser generates an exception in such case */
|
||||||
|
found = this->htmlParser.dump.find("NOINDEX");
|
||||||
|
|
||||||
|
if (found == string::npos) {
|
||||||
|
string url = currentArticle.getLongUrl();
|
||||||
|
|
||||||
|
/* Debug output */
|
||||||
|
if (verbose) {
|
||||||
|
std::cout << "Indexing " << url << "..." << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
this->indexNextArticle(url, this->htmlParser.title,
|
||||||
|
removeAccents(this->htmlParser.title),
|
||||||
|
removeAccents(this->htmlParser.keywords),
|
||||||
|
removeAccents(this->htmlParser.dump));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->indexNextPercentPost();
|
||||||
|
|
||||||
|
/* increment the offset and set returned value */
|
||||||
|
if (this->currentArticleOffset < this->lastArticleOffset) {
|
||||||
|
this->currentArticleOffset++;
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
this->stopIndexing();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,11 +20,14 @@ namespace kiwix {
|
||||||
class Indexer {
|
class Indexer {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
Indexer(const string &zimFilePath);
|
||||||
virtual bool indexNextPercent(const bool &verbose = false) = 0;
|
bool indexNextPercent(const bool &verbose = false);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void prepareIndexing() = 0;
|
virtual void indexNextPercentPre() = 0;
|
||||||
|
virtual void indexNextArticle(string &url, string &title, string &unaccentedTitle,
|
||||||
|
string &keywords, string &content) = 0;
|
||||||
|
virtual void indexNextPercentPost() = 0;
|
||||||
virtual void stopIndexing() = 0;
|
virtual void stopIndexing() = 0;
|
||||||
|
|
||||||
/* ZIM file handling */
|
/* ZIM file handling */
|
||||||
|
|
|
@ -4,7 +4,7 @@ namespace kiwix {
|
||||||
|
|
||||||
/* Constructor */
|
/* Constructor */
|
||||||
XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
|
XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
|
||||||
Indexer(zimFilePath, xapianDirectoryPath) {
|
Indexer(zimFilePath) {
|
||||||
|
|
||||||
/* Open the Xapian directory */
|
/* Open the Xapian directory */
|
||||||
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
||||||
|
@ -16,9 +16,7 @@ namespace kiwix {
|
||||||
indexer.set_stemmer(stemmer);
|
indexer.set_stemmer(stemmer);
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Read the stopwords file */
|
/* Stop words
|
||||||
/*
|
|
||||||
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
|
||||||
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
|
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
|
||||||
this->stopper.add("ceci");
|
this->stopper.add("ceci");
|
||||||
while (stopWordsIterator != this->stopWords.end()) {
|
while (stopWordsIterator != this->stopWords.end()) {
|
||||||
|
@ -27,118 +25,43 @@ namespace kiwix {
|
||||||
}
|
}
|
||||||
indexer.set_stopper(&(this->stopper));
|
indexer.set_stopper(&(this->stopper));
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Prepare the indexation */
|
|
||||||
this->prepareIndexing();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Destructor */
|
void XapianIndexer::indexNextPercentPre() {
|
||||||
XapianIndexer::~XapianIndexer() {
|
|
||||||
this->stopIndexing();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Start indexing */
|
|
||||||
void XapianIndexer::prepareIndexing() {
|
|
||||||
|
|
||||||
/* Define a few values */
|
|
||||||
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
|
|
||||||
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
|
|
||||||
this->currentArticleOffset = this->firstArticleOffset;
|
|
||||||
|
|
||||||
/* Compute few things */
|
|
||||||
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
|
||||||
this->stepSize = (float)this->articleCount / (float)100;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Index next percent */
|
|
||||||
bool XapianIndexer::indexNextPercent(const bool &verbose) {
|
|
||||||
float thresholdOffset = this->currentArticleOffset + this->stepSize;
|
|
||||||
size_t found;
|
|
||||||
|
|
||||||
/* Check if we can start */
|
|
||||||
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Begin the Xapian transation */
|
|
||||||
this->writableDatabase->begin_transaction(true);
|
this->writableDatabase->begin_transaction(true);
|
||||||
|
|
||||||
while(this->currentArticleOffset < thresholdOffset &&
|
|
||||||
this->currentArticleOffset < this->lastArticleOffset) {
|
|
||||||
|
|
||||||
zim::Article currentArticle;
|
|
||||||
Xapian::Document currentDocument;
|
|
||||||
|
|
||||||
/* Get next non redirect article */
|
|
||||||
do {
|
|
||||||
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
|
|
||||||
} while (this->currentArticleOffset++ &&
|
|
||||||
currentArticle.isRedirect() &&
|
|
||||||
this->currentArticleOffset != this->lastArticleOffset);
|
|
||||||
|
|
||||||
if (!currentArticle.isRedirect()) {
|
|
||||||
|
|
||||||
/* Index the content */
|
|
||||||
this->htmlParser.reset();
|
|
||||||
string content (currentArticle.getData().data(), currentArticle.getData().size());
|
|
||||||
|
|
||||||
/* The parser generate a lot of exceptions which should be avoided */
|
|
||||||
try {
|
|
||||||
this->htmlParser.parse_html(content, "UTF-8", true);
|
|
||||||
} catch (...) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If content does not have the noindex meta tag */
|
void XapianIndexer::indexNextArticle(string &url, string &title, string &unaccentedTitle,
|
||||||
/* Seems that the parser generates an exception in such case */
|
string &keywords, string &content) {
|
||||||
found = this->htmlParser.dump.find("NOINDEX");
|
|
||||||
|
|
||||||
if (found == string::npos) {
|
|
||||||
|
|
||||||
/* Put the data in the document */
|
/* Put the data in the document */
|
||||||
currentDocument.clear_values();
|
currentDocument.clear_values();
|
||||||
currentDocument.add_value(0, this->htmlParser.title);
|
currentDocument.add_value(0, title);
|
||||||
currentDocument.set_data(currentArticle.getLongUrl().c_str());
|
currentDocument.set_data(url);
|
||||||
indexer.set_document(currentDocument);
|
indexer.set_document(currentDocument);
|
||||||
|
|
||||||
/* Debug output */
|
|
||||||
if (verbose) {
|
|
||||||
std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Index the title */
|
/* Index the title */
|
||||||
if (!this->htmlParser.title.empty()) {
|
if (!unaccentedTitle.empty()) {
|
||||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.title),
|
indexer.index_text_without_positions(unaccentedTitle, 5);
|
||||||
((this->htmlParser.dump.size() / 100) + 1) /
|
|
||||||
countWords(this->htmlParser.title) );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Index the keywords */
|
/* Index the keywords */
|
||||||
if (!this->htmlParser.keywords.empty()) {
|
if (!keywords.empty()) {
|
||||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
|
indexer.index_text_without_positions(keywords, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Index the content */
|
/* Index the content */
|
||||||
if (!this->htmlParser.dump.empty()) {
|
if (!content.empty()) {
|
||||||
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
|
indexer.index_text_without_positions(content);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* add to the database */
|
/* add to the database */
|
||||||
this->writableDatabase->add_document(currentDocument);
|
this->writableDatabase->add_document(currentDocument);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void XapianIndexer::indexNextPercentPost() {
|
||||||
/* Flush and close Xapian transaction*/
|
/* Flush and close Xapian transaction*/
|
||||||
this->writableDatabase->commit_transaction();
|
this->writableDatabase->commit_transaction();
|
||||||
|
|
||||||
/* increment the offset and set returned value */
|
|
||||||
if (this->currentArticleOffset < this->lastArticleOffset) {
|
|
||||||
this->currentArticleOffset++;
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
this->stopIndexing();
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */
|
/* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */
|
||||||
|
|
|
@ -22,18 +22,19 @@ namespace kiwix {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
||||||
~XapianIndexer();
|
|
||||||
|
|
||||||
bool indexNextPercent(const bool &verbose = false);
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void prepareIndexing();
|
void indexNextPercentPre();
|
||||||
|
void indexNextArticle(string &url, string &title, string &unaccentedTitle,
|
||||||
|
string &keywords, string &content);
|
||||||
|
void indexNextPercentPost();
|
||||||
void stopIndexing();
|
void stopIndexing();
|
||||||
|
|
||||||
Xapian::WritableDatabase *writableDatabase;
|
Xapian::WritableDatabase *writableDatabase;
|
||||||
Xapian::Stem stemmer;
|
Xapian::Stem stemmer;
|
||||||
Xapian::SimpleStopper stopper;
|
Xapian::SimpleStopper stopper;
|
||||||
Xapian::TermGenerator indexer;
|
Xapian::TermGenerator indexer;
|
||||||
|
Xapian::Document currentDocument;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue