Commit 86326030 authored by Nelson Benítez León's avatar Nelson Benítez León Committed by Albert Astals Cid

add new 'IgnoreDiacritics' option to ::findText()

This makes it possible for simple ASCII search terms
to match their accented (or otherwise diacritic-marked)
counterparts.

This option will be ignored if the search term is
not pure Ascii.

Issue #637
parent 90a3778a
......@@ -915,6 +915,7 @@ poppler_page_find_text_with_options (PopplerPage *page,
start_at_last,
false, //stopAtLast
options & POPPLER_FIND_CASE_SENSITIVE,
options & POPPLER_FIND_IGNORE_DIACRITICS,
backwards,
options & POPPLER_FIND_WHOLE_WORDS_ONLY,
&xMin, &yMin, &xMax, &yMax))
......
......@@ -154,6 +154,9 @@ typedef enum /*< flags >*/
* @POPPLER_FIND_CASE_SENSITIVE: do case sensitive search
* @POPPLER_FIND_BACKWARDS: search backwards
* @POPPLER_FIND_WHOLE_WORDS_ONLY: search only whole words
* @POPPLER_FIND_IGNORE_DIACRITICS: do diacritics-insensitive search,
* i.e. ignore accents, umlauts, diaeresis, etc. while matching. This
* option will be ignored if the search term is not pure ASCII. Since 0.73.
*
* Flags used while searching text in a page
*
......@@ -164,7 +167,8 @@ typedef enum /*< flags >*/
POPPLER_FIND_DEFAULT = 0,
POPPLER_FIND_CASE_SENSITIVE = 1 << 0,
POPPLER_FIND_BACKWARDS = 1 << 1,
POPPLER_FIND_WHOLE_WORDS_ONLY = 1 << 2
POPPLER_FIND_WHOLE_WORDS_ONLY = 1 << 2,
POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3
} PopplerFindFlags;
typedef struct _PopplerDocument PopplerDocument;
......
......@@ -179,6 +179,14 @@
#define combMaxMidDelta 0.3
#define combMaxBaseDelta 0.4
namespace {

// Returns true when uchar is a 7-bit ASCII code point (U+0000..U+007F).
//
// A range comparison is used instead of testing bit 7 alone: masking with
// 0x80 would wrongly accept every code point >= U+0100 whose bit 7 happens
// to be clear (e.g. U+0141), and such a search term would then be treated
// as "pure ASCII".
inline bool isAscii7 (Unicode uchar) {
    return uchar < 0x80;
}

}
static int reorderText(Unicode *text, int len, UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode* u) {
char lre[8], rle[8], popdf[8], buf[8];
int lreLen = 0, rleLen = 0, popdfLen = 0, n;
......@@ -965,6 +973,9 @@ TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
normalized = nullptr;
normalized_len = 0;
normalized_idx = nullptr;
ascii_translation = nullptr;
ascii_len = 0;
ascii_idx = nullptr;
}
TextLine::~TextLine() {
......@@ -982,6 +993,10 @@ TextLine::~TextLine() {
gfree(normalized);
gfree(normalized_idx);
}
if (ascii_translation) {
gfree(ascii_translation);
gfree(ascii_idx);
}
}
void TextLine::addWord(TextWord *word) {
......@@ -3840,6 +3855,18 @@ bool TextPage::findText(Unicode *s, int len,
bool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax) {
// Overload without the ignoreDiacritics parameter: forwards every argument
// unchanged to the extended findText() overload and passes false for
// ignoreDiacritics, i.e. requests diacritics-sensitive matching.
return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast,
caseSensitive, false, backward, wholeWord,
xMin, yMin, xMax, yMax);
}
bool TextPage::findText(Unicode *s, int len,
bool startAtTop, bool stopAtBottom,
bool startAtLast, bool stopAtLast,
bool caseSensitive, bool ignoreDiacritics,
bool backward, bool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax) {
TextBlock *blk;
TextLine *line;
Unicode *s2, *txt, *reordered;
......@@ -3850,7 +3877,6 @@ bool TextPage::findText(Unicode *s, int len,
double xMin1, yMin1, xMax1, yMax1;
bool found;
if (rawOrder) {
return false;
}
......@@ -3862,10 +3888,21 @@ bool TextPage::findText(Unicode *s, int len,
// normalize the search string
s2 = unicodeNormalizeNFKC(reordered, len, &len, nullptr);
// convert the search string to uppercase
// if search string is not pure ascii then don't
// use ignoreDiacritics (as they won't match)
if (!caseSensitive) {
// convert the search string to uppercase
for (i = 0; i < len; ++i) {
s2[i] = unicodeToUpper(s2[i]);
if (ignoreDiacritics && !isAscii7(s2[i]))
ignoreDiacritics = false;
}
} else if (ignoreDiacritics) {
for (i = 0; i < len; ++i) {
if (!isAscii7(s2[i])) {
ignoreDiacritics = false;
break;
}
}
}
......@@ -3938,16 +3975,36 @@ bool TextPage::findText(Unicode *s, int len,
true);
// convert the line to uppercase
m = line->normalized_len;
if (ignoreDiacritics) {
if (!line->ascii_translation)
unicodeToAscii7(line->normalized,
line->normalized_len,
&line->ascii_translation,
&line->ascii_len,
line->normalized_idx,
&line->ascii_idx);
if (line->ascii_len)
m = line->ascii_len;
else
ignoreDiacritics = false;
}
if (!caseSensitive) {
if (m > txtSize) {
txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
txtSize = m;
}
for (k = 0; k < m; ++k) {
txt[k] = unicodeToUpper(line->normalized[k]);
}
if (ignoreDiacritics)
txt[k] = unicodeToUpper(line->ascii_translation[k]);
else
txt[k] = unicodeToUpper(line->normalized[k]);
}
} else {
txt = line->normalized;
if (ignoreDiacritics)
txt = line->ascii_translation;
else
txt = line->normalized;
}
// search each position in this line
......@@ -3970,8 +4027,14 @@ bool TextPage::findText(Unicode *s, int len,
// where s2 matches a subsequence of a compatibility equivalence
// decomposition, highlight the entire glyph, since we don't know
// the internal layout of subglyph components
int normStart = line->normalized_idx[j];
int normAfterEnd = line->normalized_idx[j + len - 1] + 1;
int normStart, normAfterEnd;
if (ignoreDiacritics) {
normStart = line->ascii_idx[j];
normAfterEnd = line->ascii_idx[j + len - 1] + 1;
} else {
normStart = line->normalized_idx[j];
normAfterEnd = line->normalized_idx[j + len - 1] + 1;
}
switch (line->rot) {
case 0:
xMin1 = line->edge[normStart];
......
......@@ -340,6 +340,9 @@ private:
Unicode *normalized; // normalized form of Unicode text
int normalized_len; // number of normalized Unicode chars
int *normalized_idx; // indices of normalized chars into Unicode text
Unicode *ascii_translation; // ascii translation from the normalized text
int ascii_len; // length of ascii translation text
int *ascii_idx; // indices of ascii chars into Unicode text of line
friend class TextLineFrag;
friend class TextBlock;
......@@ -605,6 +608,18 @@ public:
double *xMin, double *yMin,
double *xMax, double *yMax);
// Variant of findText() with an additional ignoreDiacritics parameter:
// when true, the search is diacritics-insensitive, i.e. accents, umlauts,
// diaeresis, etc. are ignored while matching. This option is ignored if
// <s> contains characters which are not pure ASCII.
bool findText(Unicode *s, int len,
bool startAtTop, bool stopAtBottom,
bool startAtLast, bool stopAtLast,
bool caseSensitive, bool ignoreDiacritics,
bool backward, bool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax);
// Get the text which is inside the specified rectangle.
GooString *getText(double xMin, double yMin,
double xMax, double yMax);
......
......@@ -27,6 +27,8 @@
#include "goo/gmem.h"
#include "PDFDocEncoding.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "UTF.h"
#include "UnicodeMapFuncs.h"
#include <algorithm>
......@@ -416,3 +418,60 @@ char *utf16ToUtf8(const uint16_t *utf16, int *len)
utf16ToUtf8(utf16, utf8);
return utf8;
}
struct Ascii7Map
{
UnicodeMap *d;
Ascii7Map()
{
GooString enc("ASCII7");
d = globalParams->getUnicodeMap(&enc);
}
};
// Translate a UCS-4 string into 7-bit ASCII using the "ASCII7" UnicodeMap.
//
//   in       - input UCS-4 characters
//   len      - number of characters in <in>
//   ucs4_out - receives the translated string as produced by
//              TextStringToUCS4(); set to nullptr when len is 0.
//              Dereferenced unconditionally, so it must not be null.
//   out_len  - receives the length reported by TextStringToUCS4()
//              (0 when len is 0). Must not be null.
//   in_idx   - optional index array with at least len + 1 entries; entry i
//              is copied for every ASCII character produced from in[i].
//   indices  - optional; when both <indices> and <in_idx> are non-null and
//              len is not 0, *indices receives a newly allocated array with
//              one entry per appended ASCII character plus a trailing entry
//              copied from in_idx[len]. Otherwise *indices is left untouched.
void unicodeToAscii7(Unicode *in, int len, Unicode **ucs4_out,
                     int *out_len, int *in_idx, int **indices)
{
    // Upper bound on the number of ASCII characters a single input
    // character can expand to; it is also the size of the scratch buffer
    // handed to mapUnicode().
    static const int maxAsciiPerChar = 8;

    static Ascii7Map uMap;
    int *idx = nullptr;

    if (!len) {
        *ucs4_out = nullptr;
        *out_len = 0;
        return;
    }

    if (indices) {
        if (!in_idx) {
            // Without the source indices there is nothing to propagate.
            indices = nullptr;
        } else {
            // Size for the worst case: every input character may append up
            // to maxAsciiPerChar entries in the loop below, plus one
            // trailing entry. Sizing this as len * 2 + 1 could be overrun
            // whenever a character maps to more than two ASCII characters.
            idx = (int *) gmallocn(len * maxAsciiPerChar + 1, sizeof(int));
        }
    }

    GooString gstr;

    char buf[maxAsciiPerChar]; // 8 is enough for mapping an unicode char to a string
    int i, n, k;

    for (i = k = 0; i < len; ++i) {
        n = uMap.d->mapUnicode(in[i], buf, sizeof(buf));
        if (!n) {
            // the Unicode char could not be converted to ascii7 counterpart
            // so just fill with a non-printable ascii char
            buf[0] = 31;
            n = 1;
        }
        gstr.append(buf, n);
        if (indices) {
            // Every ASCII character produced from in[i] points back to the
            // same source index.
            for (; n > 0; n--)
                idx[k++] = in_idx[i];
        }
    }

    *out_len = TextStringToUCS4(&gstr, ucs4_out);

    if (indices) {
        // Trailing entry, mirroring the extra element at in_idx[len].
        idx[k] = in_idx[len];
        *indices = idx;
    }
}
......@@ -75,4 +75,17 @@ int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int ma
// Allocate utf8 string and convert utf16 into it.
char *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr);
// Convert a UCS-4 string to pure ASCII (7bit)
// in - UCS-4 string to convert
// len - number of UCS-4 characters in @in
// ucs4_out - must not be NULL: receives a newly-allocated UCS-4 string holding
// the ASCII translation, or NULL when @len is 0. Free with gfree.
// out_len - must not be NULL: receives the number of UCS-4 characters in
// ucs4_out (0 when @len is 0).
// in_idx - if not NULL, the int array returned by the out fourth parameter of
// unicodeNormalizeNFKC() function. Optional, needed for @indices out parameter.
// When used it must hold @len + 1 entries, since entry [@len] is read as well.
// indices - if not NULL (and @in_idx is not NULL and @len is not 0), @indices is
// assigned the location of a newly-allocated array holding, for each character
// of the ascii string, the index of the corresponding character in the text of
// the line (thanks to this info being passed in @in_idx parameter), followed by
// one trailing entry copied from @in_idx[@len]. Otherwise @indices is left
// untouched.
void unicodeToAscii7(Unicode *in, int len, Unicode **ucs4_out, int *out_len, int *in_idx, int **indices);
#endif
......@@ -48,8 +48,8 @@ public:
static Link* convertLinkActionToLink(::LinkAction * a, DocumentData *parentDoc, const QRectF &linkArea);
TextPage *prepareTextSearch(const QString &text, Page::Rotation rotate, QVector<Unicode> *u);
bool performSingleTextSearch(TextPage* textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords);
QList<QRectF> performMultipleTextSearch(TextPage* textPage, QVector<Unicode> &u, bool sCase, bool sWords);
bool performSingleTextSearch(TextPage* textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics);
QList<QRectF> performMultipleTextSearch(TextPage* textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics);
};
}
......
......@@ -394,28 +394,28 @@ inline TextPage *PageData::prepareTextSearch(const QString &text, Page::Rotation
return textPage;
}
inline bool PageData::performSingleTextSearch(TextPage* textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords)
inline bool PageData::performSingleTextSearch(TextPage* textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics = false)
{
if (direction == Page::FromTop)
return textPage->findText( u.data(), u.size(),
true, true, false, false, sCase, false, sWords, &sLeft, &sTop, &sRight, &sBottom );
true, true, false, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom );
else if ( direction == Page::NextResult )
return textPage->findText( u.data(), u.size(),
false, true, true, false, sCase, false, sWords, &sLeft, &sTop, &sRight, &sBottom );
false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom );
else if ( direction == Page::PreviousResult )
return textPage->findText( u.data(), u.size(),
false, true, true, false, sCase, true, sWords, &sLeft, &sTop, &sRight, &sBottom );
false, true, true, false, sCase, sDiacritics, true, sWords, &sLeft, &sTop, &sRight, &sBottom );
return false;
}
inline QList<QRectF> PageData::performMultipleTextSearch(TextPage* textPage, QVector<Unicode> &u, bool sCase, bool sWords)
inline QList<QRectF> PageData::performMultipleTextSearch(TextPage* textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics = false)
{
QList<QRectF> results;
double sLeft = 0.0, sTop = 0.0, sRight = 0.0, sBottom = 0.0;
while(textPage->findText( u.data(), u.size(),
false, true, true, false, sCase, false, sWords, &sLeft, &sTop, &sRight, &sBottom ))
false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom ))
{
QRectF result;
......@@ -718,11 +718,12 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
{
const bool sCase = flags.testFlag(IgnoreCase) ? false : true;
const bool sWords = flags.testFlag(WholeWords) ? true : false;
const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false;
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords);
const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics);
textPage->decRefCnt();
......@@ -747,11 +748,12 @@ QList<QRectF> Page::search(const QString &text, SearchFlags flags, Rotation rota
{
const bool sCase = flags.testFlag(IgnoreCase) ? false : true;
const bool sWords = flags.testFlag(WholeWords) ? true : false;
const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false;
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords);
const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics);
textPage->decRefCnt();
......
......@@ -739,7 +739,10 @@ delete it;
{
NoSearchFlags = 0x00000000, ///< since 0.63
IgnoreCase = 0x00000001, ///< Case differences are ignored
WholeWords = 0x00000002 ///< Only whole words are matched
WholeWords = 0x00000002, ///< Only whole words are matched
IgnoreDiacritics = 0x00000004 ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored. \since 0.73
///< This option will have no effect if the search term contains characters which
///< are not pure ascii.
};
Q_DECLARE_FLAGS( SearchFlags, SearchFlag )
......
......@@ -11,6 +11,7 @@ private slots:
void bug7063();
void testNextAndPrevious();
void testWholeWordsOnly();
void testIgnoreDiacritics();
};
void TestSearch::bug7063()
......@@ -171,6 +172,63 @@ void TestSearch::testWholeWordsOnly()
QCOMPARE( page->search(QStringLiteral("Own"), left, top, right, bottom, direction, mode3), false );
}
// Checks Page::search() with the IgnoreDiacritics flag, alone and combined
// with IgnoreCase / WholeWords, against a small PDF containing accented
// Spanish, French and German text.
void TestSearch::testIgnoreDiacritics()
{
    QScopedPointer< Poppler::Document > document(Poppler::Document::load(TESTDATADIR "/unittestcases/Issue637.pdf"));
    QVERIFY( document );

    QScopedPointer< Poppler::Page > page(document->page(0));
    QVERIFY( page );

    const Poppler::Page::SearchDirection direction = Poppler::Page::FromTop;

    // Use the dedicated "no flags" enumerator rather than constructing the
    // flag set from a null pointer constant.
    const Poppler::Page::SearchFlags mode0 = Poppler::Page::NoSearchFlags;
    const Poppler::Page::SearchFlags mode1 = Poppler::Page::IgnoreDiacritics;
    const Poppler::Page::SearchFlags mode2 = Poppler::Page::IgnoreDiacritics | Poppler::Page::IgnoreCase;
    const Poppler::Page::SearchFlags mode3 = Poppler::Page::IgnoreDiacritics | Poppler::Page::IgnoreCase | Poppler::Page::WholeWords;
    const Poppler::Page::SearchFlags mode4 = Poppler::Page::IgnoreCase | Poppler::Page::WholeWords;

    double left = 0.0, top = 0.0, right = 0.0, bottom = 0.0;

    // Test pdf (Issue637.pdf) just contains the following three lines:
    // La cigüeña voló sobre nuestras cabezas.
    // La cigogne a survolé nos têtes.
    // Der Storch flog über unsere Köpfe hinweg.

    // Spanish line: plain, diacritics-insensitive and case-sensitive variants
    QCOMPARE( page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode0), false );
    QCOMPARE( page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode1), false );
    QCOMPARE( page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode1), true );
    QCOMPARE( page->search(QString::fromUtf8("cigüeña"), left, top, right, bottom, direction, mode1), true ); //clazy:exclude=qstring-allocations
    QCOMPARE( page->search(QString::fromUtf8("cigüena"), left, top, right, bottom, direction, mode1), false ); //clazy:exclude=qstring-allocations
    QCOMPARE( page->search(QString::fromUtf8("Cigüeña"), left, top, right, bottom, direction, mode1), false ); //clazy:exclude=qstring-allocations

    // Combined with IgnoreCase (mode2) and additionally WholeWords (mode3)
    QCOMPARE( page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode3), true );
    QCOMPARE( page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode3), true );

    // Without IgnoreDiacritics (mode4) the accents must match exactly
    QCOMPARE( page->search(QString::fromUtf8("cigüeña"), left, top, right, bottom, direction, mode4), true ); //clazy:exclude=qstring-allocations
    QCOMPARE( page->search(QString::fromUtf8("Cigüeña"), left, top, right, bottom, direction, mode4), true ); //clazy:exclude=qstring-allocations
    QCOMPARE( page->search(QString::fromUtf8("cigüena"), left, top, right, bottom, direction, mode4), false ); //clazy:exclude=qstring-allocations
    QCOMPARE( page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode4), false );

    // German line
    QCOMPARE( page->search(QStringLiteral("kopfe"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("kopfe"), left, top, right, bottom, direction, mode3), true );
    QCOMPARE( page->search(QStringLiteral("uber"), left, top, right, bottom, direction, mode0), false );
    QCOMPARE( page->search(QStringLiteral("uber"), left, top, right, bottom, direction, mode1), true );
    QCOMPARE( page->search(QStringLiteral("uber"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("uber"), left, top, right, bottom, direction, mode3), true );

    // French line: partial-word hits must fail once WholeWords is set
    QCOMPARE( page->search(QStringLiteral("vole"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("vole"), left, top, right, bottom, direction, mode3), false );
    QCOMPARE( page->search(QStringLiteral("survole"), left, top, right, bottom, direction, mode3), true );
    QCOMPARE( page->search(QStringLiteral("tete"), left, top, right, bottom, direction, mode3), false );
    QCOMPARE( page->search(QStringLiteral("tete"), left, top, right, bottom, direction, mode2), true );

    // Multi-word phrases
    QCOMPARE( page->search(QStringLiteral("La Ciguena Volo"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("Survole Nos Tetes"), left, top, right, bottom, direction, mode2), true );
    QCOMPARE( page->search(QStringLiteral("Uber Unsere Kopfe"), left, top, right, bottom, direction, mode2), true );
}
QTEST_GUILESS_MAIN(TestSearch)
#include "check_search.moc"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment