mirror of https://github.com/kiwix/libkiwix.git
+ fix the problem with failing accented searches on windows
This commit is contained in:
parent
624547d8ef
commit
19ac4741e6
|
@ -31,13 +31,10 @@ namespace kiwix {
|
|||
/* Create the enquire object */
|
||||
Xapian::Enquire enquire(this->readableDatabase);
|
||||
|
||||
/* lowercase the search pattern */
|
||||
std::transform(search.begin(), search.end(), search.begin(), ::tolower);
|
||||
|
||||
/* Create the query term vector */
|
||||
/* The double quote (") is part of the separator list because of bug ID: 2939690 */
|
||||
std::vector<std::string> queryTerms = split(removeAccents(search), " #@%$0/\\_-*()[]{},;:\"´`'");
|
||||
|
||||
|
||||
/* Create query object */
|
||||
Xapian::Query query(Xapian::Query::OP_OR, queryTerms.begin(), queryTerms.end());
|
||||
|
||||
|
|
|
@ -1,69 +1,14 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2003, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
#include "unaccent.h"
|
||||
|
||||
const char UnaccentTransliterator::fgClassID = 0;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*/
|
||||
UnaccentTransliterator::UnaccentTransliterator() :
|
||||
normalizer("", UNORM_NFD),
|
||||
Transliterator("Unaccent", 0) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
UnaccentTransliterator::~UnaccentTransliterator() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove accents from a character using Normalizer.
|
||||
*/
|
||||
UChar UnaccentTransliterator::unaccent(UChar c) const {
|
||||
UnicodeString str(c);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnaccentTransliterator* t = (UnaccentTransliterator*)this;
|
||||
|
||||
t->normalizer.setText(str, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return c;
|
||||
}
|
||||
return (UChar) t->normalizer.next();
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement Transliterator API
|
||||
*/
|
||||
void UnaccentTransliterator::handleTransliterate(Replaceable& text,
|
||||
UTransPosition& index,
|
||||
UBool incremental) const {
|
||||
UnicodeString str("a");
|
||||
while (index.start < index.limit) {
|
||||
UChar c = text.charAt(index.start);
|
||||
UChar d = unaccent(c);
|
||||
if (c != d) {
|
||||
str.setCharAt(0, d);
|
||||
text.handleReplaceBetween(index.start, index.start+1, str);
|
||||
}
|
||||
index.start++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove accents from a String */
|
||||
/* File-scope instance of the custom per-character transliterator. */
UnaccentTransliterator unaccent;
|
||||
/* File-scope UnicodeString buffer (not function-local, so it is shared by every caller). */
UnicodeString unicodeAccentedString;
|
||||
/* Receives the outcome of the createInstance() call below; it is not inspected at this point. */
UErrorCode status = U_ZERO_ERROR;
|
||||
/* Rule-based ICU transliterator built from the compound ID
   "Lower; NFD; [:M:] remove; NFC" in the forward direction.
   NOTE(review): the result is not checked against `status` here, so users of
   `trans` should be prepared for a null pointer if creation fails. */
Transliterator *trans = Transliterator::createInstance("Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status);
|
||||
|
||||
/**
 * Lowercase `text` and strip its accents, in place.
 *
 * The work is done by the file-scope ICU transliterator `trans`, which is
 * created from the rule "Lower; NFD; [:M:] remove; NFC". Only that single
 * result is written back: appending the output of a second transliteration
 * pass as well would return the text twice.
 *
 * @param text  string to convert; it is overwritten with the result
 * @return      reference to `text`
 */
std::string &removeAccents(std::string &text) {
  /* Make the char* -> UnicodeString conversion below interpret the bytes as
     UTF-8 instead of the platform default code page. */
  ucnv_setDefaultName("UTF-8");

  /* Transliterator::createInstance() may have failed; leave the input
     untouched rather than dereference a null pointer. */
  if (!trans) {
    return text;
  }

  UnicodeString ustring = UnicodeString(text.c_str());
  trans->transliterate(ustring);

  text.clear();
  ustring.toUTF8String(text);
  return text;
}
|
||||
|
||||
|
@ -72,7 +17,7 @@ void printStringInHexadecimal(UnicodeString s) {
|
|||
for (int i=0; i<s.length(); i++) {
|
||||
char c = (char)((s.getTerminatedBuffer())[i]);
|
||||
if (c & 0x80)
|
||||
std::cout << (c & 0xff) << " ";
|
||||
std::cout << (c & 0xffff) << " ";
|
||||
else
|
||||
std::cout << c << " ";
|
||||
}
|
||||
|
@ -83,7 +28,7 @@ void printStringInHexadecimal(const char *s) {
|
|||
std::cout << std::showbase << std::hex;
|
||||
for (char const* pc = s; *pc; ++pc) {
|
||||
if (*pc & 0x80)
|
||||
std::cout << (*pc & 0xff);
|
||||
std::cout << (*pc & 0xffff);
|
||||
else
|
||||
std::cout << *pc;
|
||||
std::cout << ' ';
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2003, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
#ifndef KIWIX_UNACCENT_H
|
||||
#define KIWIX_UNACCENT_H
|
||||
|
||||
#include <unicode/translit.h>
|
||||
#include <unicode/normlzr.h>
|
||||
|
@ -10,92 +7,14 @@
|
|||
#include <unicode/rep.h>
|
||||
#include <unicode/translit.h>
|
||||
#include <unicode/uniset.h>
|
||||
#include <unicode/ustring.h>
|
||||
#include <unicode/ucnv.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
//#define U_CHARSET_IS_UTF8 1
|
||||
|
||||
class UnaccentTransliterator : public Transliterator {
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*/
|
||||
UnaccentTransliterator();
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~UnaccentTransliterator();
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Implement Transliterator API
|
||||
*/
|
||||
virtual void handleTransliterate(Replaceable& text,
|
||||
UTransPosition& index,
|
||||
UBool incremental) const;
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Unaccent a single character using normalizer.
|
||||
*/
|
||||
UChar unaccent(UChar c) const;
|
||||
|
||||
Normalizer normalizer;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Return the class ID for this class. This is useful only for
|
||||
* comparing to a return value from getDynamicClassID(). For example:
|
||||
* <pre>
|
||||
* . Base* polymorphic_pointer = createPolymorphicObject();
|
||||
* . if (polymorphic_pointer->getDynamicClassID() ==
|
||||
* . Derived::getStaticClassID()) ...
|
||||
* </pre>
|
||||
* @return The class ID for all objects of this class.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
static inline UClassID getStaticClassID(void) { return (UClassID)&fgClassID; };
|
||||
|
||||
/**
|
||||
* Returns a unique class ID <b>polymorphically</b>. This method
|
||||
* is to implement a simple version of RTTI, since not all C++
|
||||
* compilers support genuine RTTI. Polymorphic operator==() and
|
||||
* clone() methods call this method.
|
||||
*
|
||||
* <p>Concrete subclasses of Transliterator that wish clients to
|
||||
* be able to identify them should implement getDynamicClassID()
|
||||
* and also a static method and data member:
|
||||
*
|
||||
* <pre>
|
||||
* static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
* static char fgClassID;
|
||||
* </pre>
|
||||
*
|
||||
* Subclasses that do not implement this method will have a
|
||||
* dynamic class ID of Transliterator::getStatisClassID().
|
||||
*
|
||||
* @return The class ID for this object. All objects of a given
|
||||
* class have the same class ID. Objects of other classes have
|
||||
* different class IDs.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); };
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Class identifier for subclasses of Transliterator that do not
|
||||
* define their class (anonymous subclasses).
|
||||
*/
|
||||
static const char fgClassID;
|
||||
};
|
||||
|
||||
/* Transliterate `text` in place to remove its accents and return a reference to it. */
std::string &removeAccents(std::string &text);
|
||||
/* Debug helper: write the characters of the C string `s` to std::cout;
   bytes with the high bit set are printed as hexadecimal numbers. */
void printStringInHexadecimal(const char *s);
|
||||
/* Debug helper: same idea for a UnicodeString, walking it up to s.length(). */
void printStringInHexadecimal(UnicodeString s);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue