Commit e3fed321 authored by Nelson Benítez León's avatar Nelson Benítez León 🌴 Committed by Albert Astals Cid
Browse files

find, glib: Enhance find to support multi-line matching

On the backend side, adds 3 new parameters to TextPage::findText(),
one bool to enable the feature, one out PDFRectangle to store
the part of the match that falls on the next line, and one out
bool to inform whether hyphen was present and ignored at end of
the previous match part.

For the glib binding, this extends the public PopplerRectangle
struct by new members to hold additional information about
whether the rectangle belongs to a group of rectangles for the
same match, and whether a hyphen was ignored at the end of the
line. Since PopplerRectangle is public ABI, this is done by making
the public PopplerRectangle API return the enlarged struct, and
internally casting to the new struct when required, the new
members are accessible only via accessor functions.

For the Qt5 and Qt6 bindings, this commit only implements the new flag
Poppler::Page::AcrossLines (but no new function and no new
return data type); if this flag is passed, the returned
list of rectangles will also include rectangles for the
second part of across-line matches.

These minimal Qt bindings still allow for the creation of
tests for this feature (using the Qt test framework), which
this commit *does include*. But a more complete binding (with
a new return type that includes 'matchContinued' and 'ignoredHyphen'
boolean fields) is left for the Qt backend maintainers to do
if they want to use this feature in e.g. Okular.

So, as mentioned, this commit incorporates tests for the
implemented across-line matching feature, and the tests do
also check for two included aspects of this feature, which are:

 - Ignoring the hyphen character while matching when 1) it's the
   last character of the line and 2) its corresponding matching
   character in the search term is not a hyphen too.

 - Any whitespace characters in the search term will be allowed
   to match at the logical position where the lines split (i.e. what
   would normally be the newline character in a text file, but
   PDF text does not include newline characters between lines).

Regarding the enhancement to findText() function which implements
matching across lines, just two more notes:

 - It won't match on text spanning more than two lines, i.e. it
   only matches text spanning from end of one line to start of
   next line.

 - It does not support finding backwards: if findText() receives
   both <backward> and <matchAcrossLines> parameters as true, it
   will ignore the <matchAcrossLines> parameter. Implementing
   <matchAcrossLines> with backwards direction is possible, but
   it would make an already complex function like findText()
   even more complex, for little gain, as e.g. Evince does not even
   use the <backward> parameter of findText().

Fixes poppler issues #744 and #755
Related Evince issue https://gitlab.gnome.org/GNOME/evince/issues/333
parent 60fec726
Pipeline #308239 passed with stages
in 8 minutes and 40 seconds
......@@ -85,6 +85,23 @@ static void pgd_find_update_progress(PgdFindDemo *demo, gint scanned)
g_free(str);
}
/*
 * Fills the tree row at @iter_child with the data of one match rectangle.
 *
 * The row title is "Match N" with N = @match_id + 1, the four coordinate
 * columns show the corners of @rect with two decimals, and @rect itself is
 * stored in the PAGE_RECT column. poppler_rectangle_free() is registered as
 * a weak-ref notifier on @model with @rect as its data.
 *
 * The row must already exist; this function only sets its columns.
 */
static void pgd_find_append_match(PgdFindDemo *demo, GtkTreeModel *model, GtkTreeIter *iter_child, PopplerRectangle *rect, int match_id)
{
    char *title = g_strdup_printf("Match %d", match_id + 1);
    char *text_x1 = g_strdup_printf("%.2f", rect->x1);
    char *text_y1 = g_strdup_printf("%.2f", rect->y1);
    char *text_x2 = g_strdup_printf("%.2f", rect->x2);
    char *text_y2 = g_strdup_printf("%.2f", rect->y2);

    gtk_tree_store_set(GTK_TREE_STORE(model), iter_child,
                       TITLE_COLUMN, title,
                       X1_COLUMN, text_x1,
                       Y1_COLUMN, text_y1,
                       X2_COLUMN, text_x2,
                       Y2_COLUMN, text_y2,
                       VISIBLE_COLUMN, TRUE,
                       PAGE_COLUMN, demo->page_index,
                       PAGE_RECT, rect,
                       -1);

    /* The formatted strings were only needed for the call above. */
    g_free(text_y2);
    g_free(text_x2);
    g_free(text_y1);
    g_free(text_x1);
    g_free(title);

    g_object_weak_ref(G_OBJECT(model), (GWeakNotify)poppler_rectangle_free, rect);
}
static gboolean pgd_find_find_text(PgdFindDemo *demo)
{
PopplerPage *page;
......@@ -103,46 +120,31 @@ static gboolean pgd_find_find_text(PgdFindDemo *demo)
matches = poppler_page_find_text_with_options(page, gtk_entry_get_text(GTK_ENTRY(demo->entry)), demo->options);
g_timer_stop(timer);
if (matches) {
GtkTreeIter iter;
GtkTreeIter iter, iter_child;
gchar *str;
GList *l;
gdouble height;
gint n_match = 0;
str = g_strdup_printf("%d matches found on page %d in %.4f seconds", g_list_length(matches), demo->page_index + 1, g_timer_elapsed(timer, NULL));
gtk_tree_store_append(GTK_TREE_STORE(model), &iter, NULL);
gtk_tree_store_set(GTK_TREE_STORE(model), &iter, TITLE_COLUMN, str, VISIBLE_COLUMN, FALSE, PAGE_COLUMN, demo->page_index, -1);
g_free(str);
poppler_page_get_size(page, NULL, &height);
for (l = matches; l && l->data; l = g_list_next(l)) {
PopplerRectangle *rect = (PopplerRectangle *)l->data;
GtkTreeIter iter_child;
gchar *x1, *y1, *x2, *y2;
gdouble tmp;
str = g_strdup_printf("Match %d", ++n_match);
x1 = g_strdup_printf("%.2f", rect->x1);
y1 = g_strdup_printf("%.2f", rect->y1);
x2 = g_strdup_printf("%.2f", rect->x2);
y2 = g_strdup_printf("%.2f", rect->y2);
tmp = rect->y1;
rect->y1 = height - rect->y2;
rect->y2 = height - tmp;
gtk_tree_store_append(GTK_TREE_STORE(model), &iter_child, &iter);
gtk_tree_store_set(GTK_TREE_STORE(model), &iter_child, TITLE_COLUMN, str, X1_COLUMN, x1, Y1_COLUMN, y1, X2_COLUMN, x2, Y2_COLUMN, y2, VISIBLE_COLUMN, TRUE, PAGE_COLUMN, demo->page_index, PAGE_RECT, rect, -1);
g_free(str);
g_free(x1);
g_free(y1);
g_free(x2);
g_free(y2);
g_object_weak_ref(G_OBJECT(model), (GWeakNotify)poppler_rectangle_free, rect);
pgd_find_append_match(demo, model, &iter_child, rect, n_match);
if (!poppler_rectangle_find_get_match_continued(rect))
++n_match;
}
g_list_free(matches);
str = g_strdup_printf("%d matches found on page %d in %.4f seconds", n_match, demo->page_index + 1, g_timer_elapsed(timer, NULL));
gtk_tree_store_set(GTK_TREE_STORE(model), &iter, TITLE_COLUMN, str, VISIBLE_COLUMN, FALSE, PAGE_COLUMN, demo->page_index, -1);
g_free(str);
}
g_timer_destroy(timer);
......@@ -154,6 +156,11 @@ static gboolean pgd_find_find_text(PgdFindDemo *demo)
return demo->page_index < demo->n_pages;
}
/*
 * Clears the idle source id stored in @demo.
 *
 * Passed as the destroy notifier when the search idle source is installed
 * with g_idle_add_full(), so the stored id is reset to 0 ("no source") once
 * that source is gone.
 */
static void find_text_idle_finish(PgdFindDemo *demo)
{
demo->idle_id = 0;
}
static cairo_surface_t *pgd_find_render_page(PgdFindDemo *demo)
{
cairo_t *cr;
......@@ -252,7 +259,7 @@ static void pgd_find_button_clicked(GtkButton *button, PgdFindDemo *demo)
pgd_find_update_progress(demo, demo->page_index);
if (demo->idle_id > 0)
g_source_remove(demo->idle_id);
demo->idle_id = g_idle_add((GSourceFunc)pgd_find_find_text, demo);
demo->idle_id = g_idle_add_full(G_PRIORITY_DEFAULT_IDLE, (GSourceFunc)pgd_find_find_text, demo, (GDestroyNotify)find_text_idle_finish);
}
static void pgd_find_button_sensitivity_cb(GtkWidget *button, GtkEntry *entry)
......@@ -309,6 +316,22 @@ static void pgd_find_backwards_toggled(GtkToggleButton *togglebutton, PgdFindDem
demo->options &= ~POPPLER_FIND_BACKWARDS;
}
/*
 * "toggled" handler of the "Multi-line" check button: mirrors the button
 * state into the POPPLER_FIND_MULTILINE bit of the demo's find options.
 */
static void pgd_find_multiline_toggled(GtkToggleButton *togglebutton, PgdFindDemo *demo)
{
    if (!gtk_toggle_button_get_active(togglebutton)) {
        demo->options &= ~POPPLER_FIND_MULTILINE;
        return;
    }

    demo->options |= POPPLER_FIND_MULTILINE;
}
/*
 * "toggled" handler of the "Ignore diacritics" check button: mirrors the
 * button state into the POPPLER_FIND_IGNORE_DIACRITICS bit of the demo's
 * find options.
 */
static void pgd_find_ignore_diacritics_toggled(GtkToggleButton *togglebutton, PgdFindDemo *demo)
{
    if (!gtk_toggle_button_get_active(togglebutton)) {
        demo->options &= ~POPPLER_FIND_IGNORE_DIACRITICS;
        return;
    }

    demo->options |= POPPLER_FIND_IGNORE_DIACRITICS;
}
static void pgd_find_whole_words_toggled(GtkToggleButton *togglebutton, PgdFindDemo *demo)
{
if (gtk_toggle_button_get_active(togglebutton))
......@@ -345,6 +368,16 @@ GtkWidget *pgd_find_create_widget(PopplerDocument *document)
hbox = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 6);
checkbutton = gtk_check_button_new_with_label("Multi-line");
g_signal_connect(checkbutton, "toggled", G_CALLBACK(pgd_find_multiline_toggled), demo);
gtk_box_pack_start(GTK_BOX(hbox), checkbutton, FALSE, FALSE, 0);
gtk_widget_show(checkbutton);
checkbutton = gtk_check_button_new_with_label("Ignore diacritics");
g_signal_connect(checkbutton, "toggled", G_CALLBACK(pgd_find_ignore_diacritics_toggled), demo);
gtk_box_pack_start(GTK_BOX(hbox), checkbutton, FALSE, FALSE, 0);
gtk_widget_show(checkbutton);
demo->entry = gtk_entry_new();
gtk_box_pack_start(GTK_BOX(hbox), demo->entry, FALSE, TRUE, 0);
gtk_widget_show(demo->entry);
......
......@@ -47,6 +47,8 @@ enum
PROP_LABEL
};
static PopplerRectangleExtended *poppler_rectangle_extended_new();
typedef struct _PopplerPageClass PopplerPageClass;
struct _PopplerPageClass
{
......@@ -615,12 +617,7 @@ GList *poppler_page_get_selection_region(PopplerPage *page, gdouble scale, Poppl
for (const PDFRectangle *selection_rect : *list) {
PopplerRectangle *rect;
rect = poppler_rectangle_new();
rect->x1 = selection_rect->x1;
rect->y1 = selection_rect->y1;
rect->x2 = selection_rect->x2;
rect->y2 = selection_rect->y2;
rect = poppler_rectangle_new_from_pdf_rectangle(selection_rect);
region = g_list_prepend(region, rect);
......@@ -811,15 +808,33 @@ char *poppler_page_get_text_for_area(PopplerPage *page, PopplerRectangle *area)
* returns a #GList of rectangles for each occurrence of the text on the page.
* The coordinates are in PDF points.
*
* Return value: (element-type PopplerRectangle) (transfer full): a #GList of #PopplerRectangle,
* When %POPPLER_FIND_MULTILINE is passed in @options, matches may span more than
* one line. In this case, the returned list will contain one #PopplerRectangle
* for each part of a match. The function poppler_rectangle_find_get_match_continued()
* will return %TRUE for all rectangles belonging to the same match, except for
* the last one. If a hyphen was ignored at the end of the part of the match,
* poppler_rectangle_find_get_ignored_hyphen() will return %TRUE for that
* rectangle.
*
* Note that currently matches spanning more than two lines are not found.
* (This limitation may be lifted in a future version.)
*
* Note also that currently finding multi-line matches backwards is not
* implemented; if you pass %POPPLER_FIND_BACKWARDS and %POPPLER_FIND_MULTILINE
* together, %POPPLER_FIND_MULTILINE will be ignored.
*
* Return value: (element-type PopplerRectangle) (transfer full): a newly allocated list
* of newly allocated #PopplerRectangle. Free with g_list_free_full() using poppler_rectangle_free().
*
* Since: 0.22
**/
GList *poppler_page_find_text_with_options(PopplerPage *page, const char *text, PopplerFindFlags options)
{
PopplerRectangle *match;
PopplerRectangleExtended *match;
GList *matches;
double xMin, yMin, xMax, yMax;
PDFRectangle continueMatch;
bool ignoredHyphen;
gunichar *ucs4;
glong ucs4_len;
double height;
......@@ -835,22 +850,46 @@ GList *poppler_page_find_text_with_options(PopplerPage *page, const char *text,
ucs4 = g_utf8_to_ucs4_fast(text, -1, &ucs4_len);
poppler_page_get_size(page, nullptr, &height);
const bool multiline = (options & POPPLER_FIND_MULTILINE);
backwards = options & POPPLER_FIND_BACKWARDS;
matches = nullptr;
xMin = 0;
yMin = backwards ? height : 0;
continueMatch.x1 = G_MAXDOUBLE; // we use this to detect valid returned values
while (text_dev->findText(ucs4, ucs4_len, false, true, // startAtTop, stopAtBottom
start_at_last,
false, // stopAtLast
options & POPPLER_FIND_CASE_SENSITIVE, options & POPPLER_FIND_IGNORE_DIACRITICS, backwards, options & POPPLER_FIND_WHOLE_WORDS_ONLY, &xMin, &yMin, &xMax, &yMax)) {
match = poppler_rectangle_new();
options & POPPLER_FIND_CASE_SENSITIVE, options & POPPLER_FIND_IGNORE_DIACRITICS, options & POPPLER_FIND_MULTILINE, backwards, options & POPPLER_FIND_WHOLE_WORDS_ONLY, &xMin, &yMin, &xMax, &yMax, &continueMatch,
&ignoredHyphen)) {
match = poppler_rectangle_extended_new();
match->x1 = xMin;
match->y1 = height - yMax;
match->x2 = xMax;
match->y2 = height - yMin;
match->match_continued = false;
match->ignored_hyphen = false;
matches = g_list_prepend(matches, match);
start_at_last = TRUE;
if (continueMatch.x1 != G_MAXDOUBLE) {
// received rect for next-line part of a multi-line match, add it.
if (multiline) {
match->match_continued = true;
match->ignored_hyphen = ignoredHyphen;
match = poppler_rectangle_extended_new();
match->x1 = continueMatch.x1;
match->y1 = height - continueMatch.y1;
match->x2 = continueMatch.x2;
match->y2 = height - continueMatch.y2;
match->match_continued = false;
match->ignored_hyphen = false;
matches = g_list_prepend(matches, match);
}
continueMatch.x1 = G_MAXDOUBLE;
}
}
g_free(ucs4);
......@@ -1565,6 +1604,22 @@ void poppler_page_remove_annot(PopplerPage *page, PopplerAnnot *annot)
G_DEFINE_BOXED_TYPE(PopplerRectangle, poppler_rectangle, poppler_rectangle_copy, poppler_rectangle_free)
/*
 * Allocates a zero-initialized PopplerRectangleExtended from the GLib slice
 * allocator. All coordinates start at 0 and both flags start as false.
 *
 * The memory must be released as a PopplerRectangleExtended slice as well.
 */
static PopplerRectangleExtended *poppler_rectangle_extended_new()
{
return g_slice_new0(PopplerRectangleExtended);
}
/*
 * Creates a new rectangle whose four coordinates are copied verbatim from
 * the core @rect. The result is backed by a zeroed PopplerRectangleExtended,
 * so its private find flags are false.
 */
PopplerRectangle *poppler_rectangle_new_from_pdf_rectangle(const PDFRectangle *rect)
{
    PopplerRectangleExtended *extended = poppler_rectangle_extended_new();

    // Lower-left corner.
    extended->x1 = rect->x1;
    extended->y1 = rect->y1;
    // Upper-right corner.
    extended->x2 = rect->x2;
    extended->y2 = rect->y2;

    return reinterpret_cast<PopplerRectangle *>(extended);
}
/**
* poppler_rectangle_new:
*
......@@ -1574,36 +1629,95 @@ G_DEFINE_BOXED_TYPE(PopplerRectangle, poppler_rectangle, poppler_rectangle_copy,
*/
PopplerRectangle *poppler_rectangle_new(void)
{
    /* Every public PopplerRectangle is backed by the larger
     * PopplerRectangleExtended: poppler_rectangle_find_get_match_continued()
     * and poppler_rectangle_find_get_ignored_hyphen() read the extended
     * members through a cast. Allocating only sizeof(PopplerRectangle) here
     * would make them read past the end of the slice, so the single return
     * path must go through the extended allocator. */
    return reinterpret_cast<PopplerRectangle *>(poppler_rectangle_extended_new());
}
/**
* poppler_rectangle_copy:
* @rectangle: a #PopplerRectangle to copy
*
* Creates a copy of @rectangle
* Creates a copy of @rectangle.
*
* Note that you must only use this function on an allocated PopplerRectangle, as
* returned by poppler_rectangle_new(), poppler_rectangle_copy(), or the list elements
* returned from poppler_page_find_text() or poppler_page_find_text_with_options().
* Returns: a new allocated copy of @rectangle
*/
PopplerRectangle *poppler_rectangle_copy(PopplerRectangle *rectangle)
{
    g_return_val_if_fail(rectangle != nullptr, NULL);

    /* Duplicate the whole extended struct, not just the public part:
     * a copy of sizeof(PopplerRectangle) bytes would drop the private
     * match_continued / ignored_hyphen flags and hand out a slice that is
     * too small for the find accessors to read. */
    auto ext_rectangle = reinterpret_cast<PopplerRectangleExtended *>(rectangle);
    return reinterpret_cast<PopplerRectangle *>(g_slice_dup(PopplerRectangleExtended, ext_rectangle));
}
/**
* poppler_rectangle_free:
* @rectangle: a #PopplerRectangle
*
* Frees the given #PopplerRectangle
* Frees the given #PopplerRectangle.
*
* Note that you must only use this function on an allocated PopplerRectangle, as
* returned by poppler_rectangle_new(), poppler_rectangle_copy(), or the list elements
* returned from poppler_page_find_text() or poppler_page_find_text_with_options().
*/
void poppler_rectangle_free(PopplerRectangle *rectangle)
{
    /* Rectangles are allocated as PopplerRectangleExtended slices (see
     * poppler_rectangle_extended_new()). The slice allocator must be given
     * the same block size on release as on allocation, so free with the
     * extended type rather than the smaller public one. */
    auto ext_rectangle = reinterpret_cast<PopplerRectangleExtended *>(rectangle);
    g_slice_free(PopplerRectangleExtended, ext_rectangle);
}
/* PopplerPoint type */
/**
* poppler_rectangle_find_get_match_continued:
* @rectangle: a #PopplerRectangle
*
* When using poppler_page_find_text_with_options() with the
* %POPPLER_FIND_MULTILINE flag, a match may span more than one line
* and thus consist of more than one rectangle. Every rectangle belonging
* to the same match will return %TRUE from this function, except for
* the last rectangle, where this function will return %FALSE.
*
* Note that you must only call this function on a #PopplerRectangle
* returned in the list from poppler_page_find_text() or
* poppler_page_find_text_with_options().
*
* Returns: whether there are more rectangles belonging to the same match
*
* Since: 21.05.0
*/
gboolean poppler_rectangle_find_get_match_continued(const PopplerRectangle *rectangle)
{
    g_return_val_if_fail(rectangle != nullptr, false);

    // The flag is a private member of the extended struct behind the public type.
    const PopplerRectangleExtended &extended = *reinterpret_cast<const PopplerRectangleExtended *>(rectangle);
    return extended.match_continued;
}
/**
* poppler_rectangle_find_get_ignored_hyphen:
* @rectangle: a #PopplerRectangle
*
* When using poppler_page_find_text_with_options() with the
* %POPPLER_FIND_MULTILINE flag, a match may span more than one line,
* and may have been formed by ignoring a hyphen at the end of the line.
* When this happens at the end of the line corresponding to @rectangle,
* this function returns %TRUE (and then poppler_rectangle_find_get_match_continued()
* will also return %TRUE); otherwise it returns %FALSE.
*
* Note that you must only call this function on a #PopplerRectangle
* returned in the list from poppler_page_find_text() or
* poppler_page_find_text_with_options().
*
* Returns: whether a hyphen was ignored at the end of the line corresponding to @rectangle.
*
* Since: 21.05.0
*/
gboolean poppler_rectangle_find_get_ignored_hyphen(const PopplerRectangle *rectangle)
{
    g_return_val_if_fail(rectangle != nullptr, false);

    // The flag is a private member of the extended struct behind the public type.
    const PopplerRectangleExtended &extended = *reinterpret_cast<const PopplerRectangleExtended *>(rectangle);
    return extended.ignored_hyphen;
}
G_DEFINE_BOXED_TYPE(PopplerPoint, poppler_point, poppler_point_copy, poppler_point_free)
......
......@@ -140,6 +140,10 @@ POPPLER_PUBLIC
PopplerRectangle *poppler_rectangle_copy(PopplerRectangle *rectangle);
POPPLER_PUBLIC
void poppler_rectangle_free(PopplerRectangle *rectangle);
POPPLER_PUBLIC
gboolean poppler_rectangle_find_get_match_continued(const PopplerRectangle *rectangle);
POPPLER_PUBLIC
gboolean poppler_rectangle_find_get_ignored_hyphen(const PopplerRectangle *rectangle);
/* A point on a page, with coordinates in PDF points. */
#define POPPLER_TYPE_POINT (poppler_point_get_type())
......
......@@ -112,6 +112,25 @@ struct _PopplerStructureElement
const StructElement *elem;
};
/*
 * PopplerRectangleExtended:
 *
 * The real type behind the public PopplerRectangle: the glib binding
 * allocates rectangles with this layout and hands them out cast to
 * PopplerRectangle.
 * Must be ABI compatible to it! The leading coordinate members have to stay
 * exactly as they are; the extra members below are private and are only
 * reachable through accessor functions.
 */
typedef struct
{
/*< private >*/
/* Coordinates; NOTE(review): these must mirror the public PopplerRectangle members — confirm against its definition. */
double x1;
double y1;
double x2;
double y2;
bool match_continued; /* Described in poppler_rectangle_find_get_match_continued() */
bool ignored_hyphen; /* Described in poppler_rectangle_find_get_ignored_hyphen() */
} PopplerRectangleExtended;
PopplerRectangle *poppler_rectangle_new_from_pdf_rectangle(const PDFRectangle *rect);
GList *_poppler_document_get_layers(PopplerDocument *document);
GList *_poppler_document_get_layer_rbgroup(PopplerDocument *document, Layer *layer);
PopplerPage *_poppler_page_new(PopplerDocument *document, Page *page, int index);
......
......@@ -157,6 +157,10 @@ typedef enum /*< flags >*/
* @POPPLER_FIND_IGNORE_DIACRITICS: do diacritics insensitive search,
* i.e. ignore accents, umlauts, diaeresis,etc. while matching. This
* option will be ignored if the search term is not pure ascii. Since 0.73.
* @POPPLER_FIND_MULTILINE: allows matching text that spans from the
* end of a line to the next line. (Currently it won't match text spanning
* more than two lines.) Automatically ignores a hyphen at the end of a line, and
* allows whitespace in the search term to match the newline character. Since: 21.05.0.
*
* Flags used while searching text in a page
*
......@@ -168,7 +172,8 @@ typedef enum /*< flags >*/
POPPLER_FIND_CASE_SENSITIVE = 1 << 0,
POPPLER_FIND_BACKWARDS = 1 << 1,
POPPLER_FIND_WHOLE_WORDS_ONLY = 1 << 2,
POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3
POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3,
POPPLER_FIND_MULTILINE = 1 << 4
} PopplerFindFlags;
typedef struct _PopplerDocument PopplerDocument;
......
......@@ -92,6 +92,8 @@ poppler_quadrilateral_copy
poppler_quadrilateral_free
poppler_quadrilateral_new
poppler_rectangle_copy
poppler_rectangle_find_get_match_continued
poppler_rectangle_find_get_ignored_hyphen
poppler_rectangle_free
poppler_rectangle_new
poppler_text_attributes_copy
......
......@@ -3798,22 +3798,62 @@ void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
#endif
}
// Writes to the four output pointers the box spanned by line->edge[start]
// and line->edge[end] along the text direction, and by the line's own
// extent across it.
//
// For rot 0 and 2 the edges are used as x coordinates and yMin/yMax come
// from the line. For rot 1 and 3 the edges are used as y coordinates and
// xMin/xMax come from the line. For rot 2 and 3 the start and end edges
// swap roles. Any other rot value leaves the outputs untouched.
void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax)
{
    const int rot = line->rot;
    if (rot < 0 || rot > 3) {
        return;
    }

    // Rotations 2 and 3 run the other way, so the edge pair is swapped.
    const bool reversed = rot >= 2;
    const double low = reversed ? line->edge[end] : line->edge[start];
    const double high = reversed ? line->edge[start] : line->edge[end];

    if (rot == 0 || rot == 2) {
        *xMin = low;
        *xMax = high;
        *yMin = line->yMin;
        *yMax = line->yMax;
    } else {
        *xMin = line->xMin;
        *xMax = line->xMax;
        *yMin = low;
        *yMax = high;
    }
}
// Convenience overload without the ignoreDiacritics and multi-line options.
// Forwards directly to the full findText() with both features disabled and
// without requesting the continuation rectangle or the ignored-hyphen flag.
// Returns whatever the full overload returns.
bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax)
{
    return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, /*ignoreDiacritics=*/false, /*matchAcrossLines=*/false, backward, wholeWord, xMin, yMin, xMax, yMax, /*continueMatch=*/nullptr, /*ignoredHyphen=*/nullptr);
}
bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
double *yMax)
{
return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
}
bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen)
{
TextBlock *blk;
TextLine *line;
Unicode *s2, *txt, *reordered;
Unicode *p;
Unicode *nextline;
int nextline_len;
bool nextlineAfterHyphen = false;
int txtSize, m, i, j, k;
double xStart, yStart, xStop, yStop;
double xMin0, yMin0, xMax0, yMax0;
double xMin1, yMin1, xMax1, yMax1;
double xMin2, yMin2, xMax2, yMax2;
bool found;
if (len == 0) {
......@@ -3824,6 +3864,11 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
return false;
}
if (matchAcrossLines && backward) {
// matchAcrossLines is unimplemented for backward search
matchAcrossLines = false;
}
// handle right-to-left text
reordered = (Unicode *)gmallocn(len, sizeof(Unicode));
reorderText(s, len, nullptr, primaryLR, nullptr, reordered);
......@@ -3907,6 +3952,12 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
if (!line->normalized)
line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, &line->normalized_idx, true);
if (matchAcrossLines && line->next && !line->next->normalized)
line->next->normalized = unicodeNormalizeNFKC(line->next->text, line->next->len, &line->next->normalized_len, &line->next->normalized_idx, true);
nextline = nullptr;
nextline_len = 0;
// convert the line to uppercase
m = line->normalized_len;
......@@ -3917,6 +3968,9 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
m = line->ascii_len;
else
ignoreDiacritics = false;
if (matchAcrossLines && line->next && !line->next->ascii_translation)
unicodeToAscii7(line->next->normalized, line->next->normalized_len, &line->next->ascii_translation, &line->next->ascii_len, line->next->normalized_idx, &line->next->ascii_idx);
}
if (!caseSensitive) {
if (m > txtSize) {
......@@ -3929,65 +3983,111 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
else
txt[k] = unicodeToUpper(line->normalized[k]);
}
if (matchAcrossLines && line->next) {
nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
nextline = (Unicode *)gmallocn(nextline_len, sizeof(Unicode));
for (k = 0; k < nextline_len; ++k) {
nextline[k] = ignoreDiacritics ? unicodeToUpper(line->next->ascii_translation[k]) : unicodeToUpper(line->next->normalized[k]);
}
}
} else {
if (ignoreDiacritics)
txt = line->ascii_translation;
else
txt = line->normalized;
if (matchAcrossLines && line->next) {
nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
nextline = ignoreDiacritics ? line->next->ascii_translation : line->next->normalized;
}
}
// search each position in this line
j = backward ? m - len : 0;
p = txt + j;
while (backward ? j >= 0 : j <= m - len) {
if (!wholeWord || ((j == 0 || !unicodeTypeAlphaNum(txt[j - 1])) && (j + len == m || !unicodeTypeAlphaNum(txt[j + len])))) {
while (backward ? j >= 0 : j <= m - (nextline ? 1 : len)) {
bool wholeWordStartIsOk, wholeWordEndIsOk;
if (wholeWord) {
wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(txt[j - 1]);
if (nextline)
wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
else
wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
}
if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) {
int n = 0;
bool spaceConsumedByNewline = false;
bool found_it;
// compare the strings
for (k = 0; k < len; ++k) {
if (p[k] != s2[k]) {
bool last_char_of_line = j + k == m - 1;
bool last_char_of_search_term = k == len - 1;
if (p[k] != s2[k] || (nextline && last_char_of_line && !last_char_of_search_term)) {
// now check if the comparison failed at the end-of-line hyphen,
// and if so, keep on comparing at the next line
nextlineAfterHyphen = false;
if (s2[k] == p[k]) {
if (p[k] != (Unicode)'-' && !UnicodeIsWhitespace(s2[k + 1])) {
break;
}
k++;
} else if (p[k] != (Unicode)'-' || UnicodeIsWhitespace(s2[k]))
break;
else
nextlineAfterHyphen = true;