mirror of https://github.com/kiwix/libkiwix.git
+ remove clucene source files
This commit is contained in:
parent
840442fccd
commit
fc6254090a
|
@ -1,86 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "cluceneIndexer.h"
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
/* Scratch buffer that receives the wide-character version of each field
 * before it is handed to CLucene (see CluceneIndexer::index()).
 *
 * Declared static so that it has internal linkage: the CluceneSearcher
 * implementation file defines a namespace-scope `buffer` with the same name
 * inside namespace kiwix, and two external definitions of kiwix::buffer
 * collide at link time when both files end up in the same binary. */
static TCHAR buffer[MAX_BUFFER_SIZE];
|
|
||||||
|
|
||||||
/* Build an indexer with no directory and no writer attached yet.
 * Both pointers are explicitly set to NULL so that they never hold an
 * indeterminate value before indexingPrelude() assigns them. */
CluceneIndexer::CluceneIndexer()
  : dir(NULL),
    writer(NULL) {
}
|
|
||||||
|
|
||||||
void CluceneIndexer::indexingPrelude(const string indexPath) {
|
|
||||||
this->dir = FSDirectory::getDirectory(indexPath.c_str(), true);
|
|
||||||
this->writer = new IndexWriter(this->dir, &analyzer, true);
|
|
||||||
this->writer->setUseCompoundFile(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CluceneIndexer::index(const string &url,
|
|
||||||
const string &title,
|
|
||||||
const string &unaccentedTitle,
|
|
||||||
const string &keywords,
|
|
||||||
const string &content,
|
|
||||||
const string &snippet,
|
|
||||||
const string &size,
|
|
||||||
const string &wordCount) {
|
|
||||||
|
|
||||||
Document doc;
|
|
||||||
|
|
||||||
/* Not indexed but stored */
|
|
||||||
//STRCPY_AtoT(buffer, title.c_str(), MAX_BUFFER_SIZE);
|
|
||||||
::mbstowcs(buffer,title.c_str(),MAX_BUFFER_SIZE);
|
|
||||||
doc.add(*_CLNEW Field(_T("title"), buffer, Field::STORE_YES | Field::INDEX_UNTOKENIZED)); // TODO: Why store, not analyzed? what is utitle?
|
|
||||||
|
|
||||||
//STRCPY_AtoT(buffer, url.c_str(), MAX_BUFFER_SIZE);
|
|
||||||
::mbstowcs(buffer,url.c_str(),MAX_BUFFER_SIZE);
|
|
||||||
doc.add(*_CLNEW Field(_T("url"), buffer, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
|
|
||||||
|
|
||||||
/* indexed but not stored */
|
|
||||||
//STRCPY_AtoT(buffer, unaccentedTitle.c_str(), MAX_BUFFER_SIZE);
|
|
||||||
::mbstowcs(buffer,unaccentedTitle.c_str(),MAX_BUFFER_SIZE);
|
|
||||||
Field *titleField = new Field(_T("utitle"), buffer, Field::STORE_NO | Field::INDEX_TOKENIZED);
|
|
||||||
titleField->setBoost(getTitleBoostFactor(content.size()));
|
|
||||||
doc.add(*titleField);
|
|
||||||
|
|
||||||
//STRCPY_AtoT(buffer, keywords.c_str(), MAX_BUFFER_SIZE);
|
|
||||||
::mbstowcs(buffer,keywords.c_str(),MAX_BUFFER_SIZE);
|
|
||||||
Field *keywordsField = new Field(_T("keywords"), buffer, Field::STORE_NO | Field::INDEX_TOKENIZED);
|
|
||||||
keywordsField->setBoost(keywordsBoostFactor);
|
|
||||||
doc.add(*keywordsField);
|
|
||||||
|
|
||||||
//STRCPY_AtoT(buffer, content.c_str(), MAX_BUFFER_SIZE);
|
|
||||||
::mbstowcs(buffer,content.c_str(),MAX_BUFFER_SIZE);
|
|
||||||
doc.add(*_CLNEW Field(_T("content"), buffer, Field::STORE_NO | Field::INDEX_TOKENIZED)); // TODO: TermVectors if you want to highlight
|
|
||||||
|
|
||||||
/* Add the document to the index */
|
|
||||||
this->writer->addDocument(&doc);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* No-op: this backend does nothing when asked to flush. */
void CluceneIndexer::flush() {}
|
|
||||||
|
|
||||||
/* Finish an indexing run: switch the writer back to compound-file mode,
 * optimize the index, close and destroy the writer, then release the
 * directory reference.
 *
 * The writer pointer is reset to NULL once destroyed, and the whole writer
 * section is skipped when the pointer is already NULL, so calling this
 * method a second time no longer closes/deletes a freed IndexWriter. */
void CluceneIndexer::indexingPostlude() {
  if (this->writer != NULL) {
    this->writer->setUseCompoundFile(true);
    this->writer->optimize();
    this->writer->close();
    delete this->writer;
    this->writer = NULL;
  }

  /* NOTE(review): _CLDECDELETE is CLucene's ref-count release macro;
     it is expected to tolerate a NULL pointer -- confirm for the CLucene
     version in use. */
  _CLDECDELETE(this->dir);
}
|
|
||||||
}
|
|
|
@ -1,65 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef KIWIX_CLUCENE_INDEXER_H
|
|
||||||
#define KIWIX_CLUCENE_INDEXER_H
|
|
||||||
|
|
||||||
#include <CLucene.h>
|
|
||||||
#include <assert.h>
|
|
||||||
#include "indexer.h"
|
|
||||||
|
|
||||||
#define MAX_BUFFER_SIZE 4200000
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
using namespace lucene::analysis;
|
|
||||||
using namespace lucene::index;
|
|
||||||
using namespace lucene::document;
|
|
||||||
using namespace lucene::queryParser;
|
|
||||||
using namespace lucene::search;
|
|
||||||
using namespace lucene::store;
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
class CluceneIndexer : public Indexer {
|
|
||||||
|
|
||||||
public:
|
|
||||||
CluceneIndexer();
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void indexingPrelude(const string indexPath);
|
|
||||||
void index(const string &url,
|
|
||||||
const string &title,
|
|
||||||
const string &unaccentedTitle,
|
|
||||||
const string &keywords,
|
|
||||||
const string &content,
|
|
||||||
const string &snippet,
|
|
||||||
const string &size,
|
|
||||||
const string &wordCount);
|
|
||||||
void flush();
|
|
||||||
void indexingPostlude();
|
|
||||||
|
|
||||||
FSDirectory* dir;
|
|
||||||
IndexWriter* writer;
|
|
||||||
lucene::analysis::standard::StandardAnalyzer analyzer;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -1,118 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "cluceneSearcher.h"
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
IndexSearcher* CluceneSearcher::searcher = NULL;
|
|
||||||
Directory* CluceneSearcher::dir = NULL;
|
|
||||||
|
|
||||||
TCHAR buffer[MAX_BUFFER_SIZE];
|
|
||||||
|
|
||||||
/* Constructor */
|
|
||||||
CluceneSearcher::CluceneSearcher(const string &cluceneDirectoryPath)
  : kiwix::Searcher() {
  /* `searcher` is a class-wide static: once it has been set, further
     instances reuse it and their path argument is not consulted. */
  if (searcher != NULL) {
    return;
  }

  this->openIndex(cluceneDirectoryPath);
}
|
|
||||||
|
|
||||||
/* Open Clucene readable database */
|
|
||||||
void CluceneSearcher::openIndex(const string &directoryPath) {
  /* Trace the folder being opened on standard output. */
  cout << "Open index folder at " << directoryPath << endl;

  /* Fill in the two class-wide handles: the directory first, then the
     searcher reading from it.
     NOTE(review): the `false` argument is presumably CLucene's "create"
     flag (open an existing index) -- confirm. */
  const char *path = directoryPath.c_str();
  dir = FSDirectory::getDirectory(path, false);
  searcher = new IndexSearcher(dir);
}
|
|
||||||
|
|
||||||
/* Close Clucene readable database (currently a no-op; the shared handles are released by terminate()) */
|
|
||||||
void CluceneSearcher::closeIndex() {
  /* Empty: nothing is released per instance. The static dir/searcher
     handles are closed and freed in terminate(). */
}
|
|
||||||
|
|
||||||
void CluceneSearcher::terminate()
|
|
||||||
{
|
|
||||||
dir->close();
|
|
||||||
searcher->close();
|
|
||||||
delete searcher;
|
|
||||||
_CLLDECDELETE(dir);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Convert a CLucene TCHAR string to std::string.
 *
 * Stub: the conversion was commented out for CLucene 2.3, so the argument
 * is ignored and an empty string is always returned. */
std::string toString(const TCHAR* s){
  // Disabled implementation, kept for reference:
  //
  //   int32_t len = _tcslen(s);
  //   char* buf = new char[len+1];
  //   STRCPY_WtoA(buf,s,len+1);
  //   string ret = buf;
  //   delete[] buf;
  //   return ret;

  return std::string();
}
|
|
||||||
|
|
||||||
/* Search strings in the database */
|
|
||||||
void CluceneSearcher::searchInIndex(string &search, const unsigned int resultStart,
                                    const unsigned int resultEnd, const bool verbose) {
  /* Stub: the whole implementation was commented out for CLucene 2.3.
     None of the parameters is read and no result is produced. The disabled
     code is kept below for reference. */

  // --- Parse query (disabled) ---
  //
  //   lucene::analysis::standard::StandardAnalyzer* analyzer = new lucene::analysis::standard::StandardAnalyzer();
  //   QueryParser* parser = new QueryParser(_T("content"), analyzer);
  //   STRCPY_AtoT(buffer, search.c_str(), MAX_BUFFER_SIZE);
  //
  //   Query* query = parser->parse(buffer);
  //   delete parser;
  //   delete analyzer;
  //
  //   cout << "Query: " << search << endl;
  //   wcout << "Buffer: " << buffer << endl;
  //
  //   if (query == NULL){
  //     cout << "Hits length:0 (null query)" << endl;
  //     return;
  //   }
  //
  //   const wchar_t* querystring = query->toString();
  //   wcout << L"Query2string: " << querystring << endl;
  //   delete[] querystring;
  //
  // --- Search (disabled) ---
  //
  //   Hits* hits = searcher->search(query);
  //   cout << "Hits length:" << hits->length() << endl;
  //
  //   for (int32_t i=0; i < hits->length() && i<10; i++) {
  //     Document* d = &hits->doc(i);
  //     _tprintf(_T("#%d. %s, url: %s (score: %f)\n"),
  //              i + 1, d->get(_T("title")), d->get(_T("url")),
  //              hits->score(i));
  //   }
  //
  // --- Result collection (disabled) ---
  //
  //   Result result;
  //   result.url = doc.get_data();
  //   result.title = doc.get_value(0);
  //   result.score = i.get_percent();
  //
  //   this->results.push_back(result);
  //
  //   delete hits;
  //   delete query;
}
|
|
||||||
}
|
|
|
@ -1,61 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef KIWIX_CLUCENE_SEARCHER_H
|
|
||||||
#define KIWIX_CLUCENE_SEARCHER_H
|
|
||||||
|
|
||||||
#include <CLucene.h>
|
|
||||||
#include <assert.h>
|
|
||||||
#include <CLucene/queryParser/MultiFieldQueryParser.h>
|
|
||||||
#include "searcher.h"
|
|
||||||
|
|
||||||
#define MAX_BUFFER_SIZE 4200000
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace lucene::analysis;
|
|
||||||
using namespace lucene::index;
|
|
||||||
using namespace lucene::document;
|
|
||||||
using namespace lucene::queryParser;
|
|
||||||
using namespace lucene::search;
|
|
||||||
using namespace lucene::store;
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
class CluceneSearcher : public Searcher {
|
|
||||||
|
|
||||||
public:
|
|
||||||
CluceneSearcher(const string &cluceneDirectoryPath);
|
|
||||||
|
|
||||||
void searchInIndex(string &search, const unsigned int resultStart,
|
|
||||||
const unsigned int resultEnd, const bool verbose=false);
|
|
||||||
|
|
||||||
static void terminate();
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void closeIndex();
|
|
||||||
void openIndex(const string &cluceneDirectoryPath);
|
|
||||||
|
|
||||||
static IndexSearcher* searcher;
|
|
||||||
static Directory* dir;
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
Loading…
Reference in New Issue