diff --git a/glib/demo/find.c b/glib/demo/find.c index 8b02dfc6dba62b93932d2238f72c8dba69c796ed..52802055fc2c195543e6121219e106955fd59e11 100644 --- a/glib/demo/find.c +++ b/glib/demo/find.c @@ -88,6 +88,41 @@ pgd_find_update_progress (PgdFindDemo *demo, g_free (str); } +static void +pgd_find_append_match (PgdFindDemo *demo, + GtkTreeModel *model, + GtkTreeIter *iter_child, + PopplerRectangle *rect, + int match_id) +{ + char *x1, *y1, *x2, *y2, *str; + + str = g_strdup_printf ("Match %d", match_id + 1); + x1 = g_strdup_printf ("%.2f", rect->x1); + y1 = g_strdup_printf ("%.2f", rect->y1); + x2 = g_strdup_printf ("%.2f", rect->x2); + y2 = g_strdup_printf ("%.2f", rect->y2); + + gtk_tree_store_set (GTK_TREE_STORE (model), iter_child, + TITLE_COLUMN, str, + X1_COLUMN, x1, + Y1_COLUMN, y1, + X2_COLUMN, x2, + Y2_COLUMN, y2, + VISIBLE_COLUMN, TRUE, + PAGE_COLUMN, demo->page_index, + PAGE_RECT, rect, + -1); + g_free (str); + g_free (x1); + g_free (y1); + g_free (x2); + g_free (y2); + g_object_weak_ref (G_OBJECT (model), + (GWeakNotify)poppler_rectangle_free, + rect); +} + static gboolean pgd_find_find_text (PgdFindDemo *demo) { @@ -107,63 +142,43 @@ pgd_find_find_text (PgdFindDemo *demo) matches = poppler_page_find_text_with_options (page, gtk_entry_get_text (GTK_ENTRY (demo->entry)), demo->options); g_timer_stop (timer); if (matches) { - GtkTreeIter iter; + GtkTreeIter iter, iter_child; gchar *str; GList *l; gdouble height; gint n_match = 0; - str = g_strdup_printf ("%d matches found on page %d in %.4f seconds", - g_list_length (matches), demo->page_index + 1, - g_timer_elapsed (timer, NULL)); - gtk_tree_store_append (GTK_TREE_STORE (model), &iter, NULL); - gtk_tree_store_set (GTK_TREE_STORE (model), &iter, - TITLE_COLUMN, str, - VISIBLE_COLUMN, FALSE, - PAGE_COLUMN, demo->page_index, - -1); - g_free (str); poppler_page_get_size (page, NULL, &height); for (l = matches; l && l->data; l = g_list_next (l)) { PopplerRectangle *rect = (PopplerRectangle *)l->data; - GtkTreeIter 
iter_child; - gchar *x1, *y1, *x2, *y2; - gdouble tmp; - - str = g_strdup_printf ("Match %d", ++n_match); - x1 = g_strdup_printf ("%.2f", rect->x1); - y1 = g_strdup_printf ("%.2f", rect->y1); - x2 = g_strdup_printf ("%.2f", rect->x2); - y2 = g_strdup_printf ("%.2f", rect->y2); + gdouble tmp; - tmp = rect->y1; - rect->y1 = height - rect->y2; - rect->y2 = height - tmp; + tmp = rect->y1; + rect->y1 = height - rect->y2; + rect->y2 = height - tmp; gtk_tree_store_append (GTK_TREE_STORE (model), &iter_child, &iter); - gtk_tree_store_set (GTK_TREE_STORE (model), &iter_child, - TITLE_COLUMN, str, - X1_COLUMN, x1, - Y1_COLUMN, y1, - X2_COLUMN, x2, - Y2_COLUMN, y2, - VISIBLE_COLUMN, TRUE, - PAGE_COLUMN, demo->page_index, - PAGE_RECT, rect, - -1); - g_free (str); - g_free (x1); - g_free (y1); - g_free (x2); - g_free (y2); - g_object_weak_ref (G_OBJECT (model), - (GWeakNotify)poppler_rectangle_free, - rect); + pgd_find_append_match (demo, model, &iter_child, rect, n_match); + + if (!poppler_rectangle_find_get_match_continued (rect)) + ++n_match; + } g_list_free (matches); + + str = g_strdup_printf ("%d matches found on page %d in %.4f seconds", + n_match, demo->page_index + 1, + g_timer_elapsed (timer, NULL)); + + gtk_tree_store_set (GTK_TREE_STORE (model), &iter, + TITLE_COLUMN, str, + VISIBLE_COLUMN, FALSE, + PAGE_COLUMN, demo->page_index, + -1); + g_free (str); } g_timer_destroy (timer); @@ -356,6 +371,16 @@ pgd_find_backwards_toggled (GtkToggleButton *togglebutton, demo->options &= ~POPPLER_FIND_BACKWARDS; } +static void +pgd_find_multiline_toggled (GtkToggleButton *togglebutton, + PgdFindDemo *demo) +{ + if (gtk_toggle_button_get_active (togglebutton)) + demo->options |= POPPLER_FIND_MULTILINE; + else + demo->options &= ~POPPLER_FIND_MULTILINE; +} + static void pgd_find_whole_words_toggled (GtkToggleButton *togglebutton, PgdFindDemo *demo) @@ -421,6 +446,13 @@ pgd_find_create_widget (PopplerDocument *document) hbox = gtk_box_new (GTK_ORIENTATION_HORIZONTAL, 6); + 
checkbutton = gtk_check_button_new_with_label ("Multi-line"); + g_signal_connect (checkbutton, "toggled", + G_CALLBACK (pgd_find_multiline_toggled), + demo); + gtk_box_pack_start (GTK_BOX (hbox), checkbutton, FALSE, FALSE, 0); + gtk_widget_show (checkbutton); + checkbutton = gtk_check_button_new_with_label ("Case sensitive"); g_signal_connect (checkbutton, "toggled", G_CALLBACK (pgd_find_case_sensitive_toggled), diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index 134557e0c4f7104340a6fd3a9eb95d6e79982925..5d83a74f285ee86b9c503f368ea70962faca6848 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -44,6 +44,8 @@ enum PROP_LABEL }; +static PopplerRectangleExtended* poppler_rectangle_extended_new (void); + typedef struct _PopplerPageClass PopplerPageClass; struct _PopplerPageClass { @@ -697,13 +699,7 @@ poppler_page_get_selection_region (PopplerPage *page, PDFRectangle *selection_rect = (*list)[i]; PopplerRectangle *rect; - rect = poppler_rectangle_new (); - - rect->x1 = selection_rect->x1; - rect->y1 = selection_rect->y1; - rect->x2 = selection_rect->x2; - rect->y2 = selection_rect->y2; - + rect = poppler_rectangle_new_from_pdf_rectangle (selection_rect); region = g_list_prepend (region, rect); delete selection_rect; @@ -907,7 +903,24 @@ poppler_page_get_text_for_area (PopplerPage *page, * returns a #GList of rectangles for each occurrence of the text on the page. * The coordinates are in PDF points. * - * Return value: (element-type PopplerRectangle) (transfer full): a #GList of #PopplerRectangle, + * When %POPPLER_FIND_MULTILINE is passed in @options, matches may span more than + * one line. In this case, the returned list will contain one #PopplerRectangle + * for each part of a match. The function poppler_rectangle_find_get_match_continued() + * will return %TRUE for all rectangles belonging to the same match, except for + * the last one. 
If a hyphen was ignored at the end of the part of the match, + * poppler_rectangle_find_get_ignored_hyphen() will return %TRUE for that + * rectangle. + * + * Note that currently matches spanning more than two lines are not found. + * (This limitation may be lifted in a future version.) + * + * Note also that currently finding multi-line matches backwards is not + * implemented; if you pass %POPPLER_FIND_BACKWARDS and %POPPLER_FIND_MULTILINE + * together, %POPPLER_FIND_MULTILINE will be ignored. + * + * Return value: (element-type PopplerRectangle) (transfer full): a newly allocated list + * of newly allocated #PopplerRectangle. Free with + * g_list_free_full() using poppler_rectangle_free(). * * Since: 0.22 **/ @@ -916,9 +929,11 @@ poppler_page_find_text_with_options (PopplerPage *page, const char *text, PopplerFindFlags options) { - PopplerRectangle *match; + PopplerRectangleExtended *match; GList *matches; double xMin, yMin, xMax, yMax; + double xMinNext, yMinNext, xMaxNext, yMaxNext; + bool afterHyphen; gunichar *ucs4; glong ucs4_len; double height; @@ -934,10 +949,12 @@ poppler_page_find_text_with_options (PopplerPage *page, ucs4 = g_utf8_to_ucs4_fast (text, -1, &ucs4_len); poppler_page_get_size (page, nullptr, &height); + const bool multiline = (options & POPPLER_FIND_MULTILINE); backwards = options & POPPLER_FIND_BACKWARDS; matches = nullptr; xMin = 0; yMin = backwards ? 
height : 0; + xMinNext = G_MAXDOUBLE; //we use this to detect valid returned values while (text_dev->findText (ucs4, ucs4_len, false, true, // startAtTop, stopAtBottom @@ -945,17 +962,41 @@ poppler_page_find_text_with_options (PopplerPage *page, false, //stopAtLast options & POPPLER_FIND_CASE_SENSITIVE, options & POPPLER_FIND_IGNORE_DIACRITICS, + options & POPPLER_FIND_MULTILINE, backwards, options & POPPLER_FIND_WHOLE_WORDS_ONLY, - &xMin, &yMin, &xMax, &yMax)) + &xMin, &yMin, &xMax, &yMax, + &xMinNext, &yMinNext, &xMaxNext, &yMaxNext, &afterHyphen)) { - match = poppler_rectangle_new (); + match = poppler_rectangle_extended_new (); match->x1 = xMin; match->y1 = height - yMax; match->x2 = xMax; match->y2 = height - yMin; + match->match_continued = false; + match->ignored_hyphen = false; matches = g_list_prepend (matches, match); + start_at_last = TRUE; + + if (xMinNext != G_MAXDOUBLE) { + // received rect for next-line part of a across-lines match, add it. + if (multiline) { + match->match_continued = true; + match->ignored_hyphen = afterHyphen; + + match = poppler_rectangle_extended_new (); + match->x1 = xMinNext; + match->y1 = height - yMaxNext; + match->x2 = xMaxNext; + match->y2 = height - yMinNext; + match->match_continued = false; + match->ignored_hyphen = false; + matches = g_list_prepend (matches, match); + } + + xMinNext = G_MAXDOUBLE; + } } g_free (ucs4); @@ -1553,6 +1594,24 @@ POPPLER_DEFINE_BOXED_TYPE (PopplerRectangle, poppler_rectangle, poppler_rectangle_copy, poppler_rectangle_free) +static PopplerRectangleExtended* +poppler_rectangle_extended_new (void) +{ + return g_slice_new0 (PopplerRectangleExtended); +} + +PopplerRectangle* +poppler_rectangle_new_from_pdf_rectangle (const PDFRectangle* rect) +{ + auto r = poppler_rectangle_extended_new (); + r->x1 = rect->x1; + r->y1 = rect->y1; + r->x2 = rect->x2; + r->y2 = rect->y2; + + return reinterpret_cast(r); +} + /** * poppler_rectangle_new: * @@ -1563,14 +1622,18 @@ POPPLER_DEFINE_BOXED_TYPE 
(PopplerRectangle, poppler_rectangle, PopplerRectangle * poppler_rectangle_new (void) { - return g_slice_new0 (PopplerRectangle); + return reinterpret_cast(poppler_rectangle_extended_new ()); } /** * poppler_rectangle_copy: * @rectangle: a #PopplerRectangle to copy * - * Creates a copy of @rectangle + * Creates a copy of @rectangle. + * + * Note that you must only use this function on an allocated PopplerRectangle, as + * returned by poppler_rectangle_new(), poppler_rectangle_copy(), or the list elements + * returned from poppler_page_find_text() or poppler_page_find_text_with_options(). * * Returns: a new allocated copy of @rectangle */ @@ -1579,22 +1642,104 @@ poppler_rectangle_copy (PopplerRectangle *rectangle) { g_return_val_if_fail (rectangle != nullptr, NULL); - return g_slice_dup (PopplerRectangle, rectangle); + auto ext_rectangle = reinterpret_cast(rectangle); + return reinterpret_cast(g_slice_dup (PopplerRectangleExtended, ext_rectangle)); } /** * poppler_rectangle_free: * @rectangle: a #PopplerRectangle * - * Frees the given #PopplerRectangle + * Frees the given #PopplerRectangle. + * + * Note that you must only use this function on an allocated PopplerRectangle, as + * returned by poppler_rectangle_new(), poppler_rectangle_copy(), or the list elements + * returned from poppler_page_find_text() or poppler_page_find_text_with_options(). */ void poppler_rectangle_free (PopplerRectangle *rectangle) { - g_slice_free (PopplerRectangle, rectangle); + auto ext_rectangle = reinterpret_cast(rectangle); + g_slice_free (PopplerRectangleExtended, ext_rectangle); +} + +/** + * poppler_rectangle_to_cairo: + * @rectangle: a #PopplerRectangle + * @cairo_rectangle: (out): a #cairo_rectangle_t to fill in + * + * Transforms @rectangle into a #cairo_rectangle_t, i.e. its origin (x1, y1) together with its width (x2 - x1) and height (y2 - y1). 
+ * + * Since: 0.78 + */ +void +poppler_rectangle_to_cairo (const PopplerRectangle *rectangle, + cairo_rectangle_t *cairo_rectangle) +{ + g_return_if_fail (rectangle != nullptr); + g_return_if_fail (cairo_rectangle != nullptr); + + cairo_rectangle->x = rectangle->x1; + cairo_rectangle->y = rectangle->y1; + cairo_rectangle->width = rectangle->x2 - rectangle->x1; + cairo_rectangle->height = rectangle->y2 - rectangle->y1; } -/* PopplerPoint type */ +/** + * poppler_rectangle_find_get_match_continued: + * @rectangle: a #PopplerRectangle + * + * When using poppler_page_find_text_with_options() with the + * %POPPLER_FIND_MULTILINE flag, a match may span more than one line + * and thus consist of more than one rectangle. Every rectangle belonging + * to the same match will return %TRUE from this function, except for + * the last rectangle, where this function will return %FALSE. + * + * Note that you must only call this function on a #PopplerRectangle + * returned in the list from poppler_page_find_text() or + * poppler_page_find_text_with_options(). + * + * Returns: whether there are more rectangles belonging to the same match + * + * Since: 0.78 + */ +gboolean +poppler_rectangle_find_get_match_continued (const PopplerRectangle *rectangle) +{ + g_return_val_if_fail (rectangle != nullptr, false); + + auto ext_rectangle = reinterpret_cast(rectangle); + return ext_rectangle->match_continued; +} + +/** + * poppler_rectangle_find_get_ignored_hyphen: + * @rectangle: a #PopplerRectangle + * + * When using poppler_page_find_text_with_options() with the + * %POPPLER_FIND_MULTILINE flag, a match may span more than one line, + * and may have been formed by ignoring a hyphen at the end of the line. + * When this happens at the end of the line corresponding to @rectangle, + * this function returns %TRUE (and then poppler_rectangle_find_get_match_continued() + * will also return %TRUE); otherwise it returns %FALSE. 
+ * + * Note that you must only call this function on a #PopplerRectangle + * returned in the list from poppler_page_find_text() or + * poppler_page_find_text_with_options(). + * + * Returns: whether a hyphen was ignored at the end of the line corresponding + * to @rectangle. + * + * Since: 0.78 + */ +gboolean +poppler_rectangle_find_get_ignored_hyphen (const PopplerRectangle *rectangle) +{ + g_return_val_if_fail (rectangle != nullptr, false); + + auto ext_rectangle = reinterpret_cast(rectangle); + return ext_rectangle->ignored_hyphen; +} POPPLER_DEFINE_BOXED_TYPE (PopplerPoint, poppler_point, poppler_point_copy, diff --git a/glib/poppler-page.h b/glib/poppler-page.h index f99f0920673bb7cb5d6af80f2deb9d8879d32857..41f5b3a8cb30ffc8cb339d5abce8e4fb1ddc1ec2 100644 --- a/glib/poppler-page.h +++ b/glib/poppler-page.h @@ -176,6 +176,13 @@ POPPLER_PUBLIC PopplerRectangle *poppler_rectangle_copy (PopplerRectangle *rectangle); POPPLER_PUBLIC void poppler_rectangle_free (PopplerRectangle *rectangle); +POPPLER_PUBLIC +void poppler_rectangle_to_cairo (const PopplerRectangle *rectangle, + cairo_rectangle_t *cairo_rectangle); +POPPLER_PUBLIC +gboolean poppler_rectangle_find_get_match_continued (const PopplerRectangle *rectangle); +POPPLER_PUBLIC +gboolean poppler_rectangle_find_get_ignored_hyphen (const PopplerRectangle *rectangle); /* A point on a page, with coordinates in PDF points. */ #define POPPLER_TYPE_POINT (poppler_point_get_type ()) diff --git a/glib/poppler-private.h b/glib/poppler-private.h index 3e6b4b17ae6efc1b8a605fe7c3ff6f5d436c30f5..e3ae2f28f6ec8fbfc4d2ae6f55cf3655b7c4251f 100644 --- a/glib/poppler-private.h +++ b/glib/poppler-private.h @@ -109,6 +109,24 @@ struct _PopplerStructureElement const StructElement *elem; }; +/* + * PopplerRectangleExtended: + * + * The real type behind the public PopplerRectangle. + * Must be ABI compatible to it! 
+ */ +typedef struct { + /*< private >*/ + double x1; + double y1; + double x2; + double y2; + bool match_continued; + bool ignored_hyphen; +} PopplerRectangleExtended; + +PopplerRectangle* poppler_rectangle_new_from_pdf_rectangle (const PDFRectangle* rect); + GList *_poppler_document_get_layers (PopplerDocument *document); GList *_poppler_document_get_layer_rbgroup (PopplerDocument *document, Layer *layer); diff --git a/glib/poppler.h b/glib/poppler.h index 91b0b4f9b6d4c6ad4af66f20d6a14cac7eba758a..298db6c11f24d5ac5a234678abfc7d532f6d3611 100644 --- a/glib/poppler.h +++ b/glib/poppler.h @@ -157,6 +157,10 @@ typedef enum /*< flags >*/ * @POPPLER_FIND_IGNORE_DIACRITICS: do diacritics insensitive search, * i.e. ignore accents, umlauts, diaeresis,etc. while matching. This * option will be ignored if the search term is not pure ascii. Since 0.73. + * @POPPLER_FIND_MULTILINE: allows to match on text spanning from + * end of a line to the next line. (Currently it won't match on text spanning + * more than two lines.) Automatically ignores hyphen at end of line, and + * allows whitespace in search term to match on newline char. Since: 0.78. 
* * Flags using while searching text in a page * @@ -168,7 +172,8 @@ typedef enum /*< flags >*/ POPPLER_FIND_CASE_SENSITIVE = 1 << 0, POPPLER_FIND_BACKWARDS = 1 << 1, POPPLER_FIND_WHOLE_WORDS_ONLY = 1 << 2, - POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3 + POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3, + POPPLER_FIND_MULTILINE = 1 << 4 } PopplerFindFlags; typedef struct _PopplerDocument PopplerDocument; diff --git a/glib/reference/poppler-sections.txt b/glib/reference/poppler-sections.txt index 2da8aea5a8194ae2405643546558689deeed95d8..b56c3d7a3bbb599f557646d3248c7e4056fd60b9 100644 --- a/glib/reference/poppler-sections.txt +++ b/glib/reference/poppler-sections.txt @@ -93,6 +93,9 @@ poppler_quadrilateral_new poppler_rectangle_copy poppler_rectangle_free poppler_rectangle_new +poppler_rectangle_to_cairo +poppler_rectangle_find_get_match_continued +poppler_rectangle_find_get_ignored_hyphen poppler_text_attributes_copy poppler_text_attributes_free poppler_text_attributes_new diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 391904b5ed11eae06eb56a794374c285ce87e59d..94e9dcd4535153b8568ec5f2d24b5eabaf7a10e0 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3871,8 +3871,8 @@ bool TextPage::findText(Unicode *s, int len, double *xMin, double *yMin, double *xMax, double *yMax) { return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, - caseSensitive, false, backward, wholeWord, - xMin, yMin, xMax, yMax); + caseSensitive, false, false, backward, wholeWord, + xMin, yMin, xMax, yMax, nullptr, nullptr, nullptr, nullptr, nullptr); } bool TextPage::findText(Unicode *s, int len, @@ -3882,10 +3882,28 @@ bool TextPage::findText(Unicode *s, int len, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) { + return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, + caseSensitive, ignoreDiacritics, false, backward, wholeWord, + xMin, yMin, xMax, yMax, nullptr, nullptr, nullptr, 
nullptr, nullptr); +} + +bool TextPage::findText(Unicode *s, int len, + bool startAtTop, bool stopAtBottom, + bool startAtLast, bool stopAtLast, + bool caseSensitive, bool ignoreDiacritics, + bool matchAcrossLines, + bool backward, bool wholeWord, + double *xMin, double *yMin, + double *xMax, double *yMax, + double *xMinNext, double *yMinNext, + double *xMaxNext, double *yMaxNext, bool *afterHyphen) { TextBlock *blk; TextLine *line; Unicode *s2, *txt, *reordered; Unicode *p; + Unicode *nextline; + int nextline_len; + bool nextlineAfterHyphen = false; int txtSize, m, i, j, k; double xStart, yStart, xStop, yStop; double xMin0, yMin0, xMax0, yMax0; @@ -3896,6 +3914,10 @@ bool TextPage::findText(Unicode *s, int len, return false; } + if (matchAcrossLines && backward) { + // matchAcrossLines is unimplemented for backward search + matchAcrossLines = false; + } // handle right-to-left text reordered = (Unicode*)gmallocn(len, sizeof(Unicode)); reorderText(s, len, nullptr, primaryLR, nullptr, reordered); @@ -3988,6 +4010,16 @@ bool TextPage::findText(Unicode *s, int len, &line->normalized_len, &line->normalized_idx, true); + + if (matchAcrossLines && line->next && !line->next->normalized) + line->next->normalized = unicodeNormalizeNFKC(line->next->text, line->next->len, + &line->next->normalized_len, + &line->next->normalized_idx, + true); + + nextline = nullptr; + nextline_len = 0; + // convert the line to uppercase m = line->normalized_len; @@ -4003,6 +4035,14 @@ bool TextPage::findText(Unicode *s, int len, m = line->ascii_len; else ignoreDiacritics = false; + + if (matchAcrossLines && line->next && !line->next->ascii_translation) + unicodeToAscii7(line->next->normalized, + line->next->normalized_len, + &line->next->ascii_translation, + &line->next->ascii_len, + line->next->normalized_idx, + &line->next->ascii_idx); } if (!caseSensitive) { if (m > txtSize) { @@ -4015,40 +4055,111 @@ bool TextPage::findText(Unicode *s, int len, else txt[k] = 
unicodeToUpper(line->normalized[k]); } + + if (matchAcrossLines && line->next) { + nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len; + nextline = (Unicode *) gmallocn(nextline_len, sizeof(Unicode)); + for (k = 0; k < nextline_len; ++k) { + nextline[k] = ignoreDiacritics ? unicodeToUpper(line->next->ascii_translation[k]) + : unicodeToUpper(line->next->normalized[k]); + } + } } else { if (ignoreDiacritics) txt = line->ascii_translation; else txt = line->normalized; + + if (matchAcrossLines && line->next) { + nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len; + nextline = ignoreDiacritics ? line->next->ascii_translation : line->next->normalized; + } } // search each position in this line j = backward ? m - len : 0; p = txt + j; - while (backward ? j >= 0 : j <= m - len) { - if (!wholeWord || - ((j == 0 || !unicodeTypeAlphaNum(txt[j - 1])) && - (j + len == m || !unicodeTypeAlphaNum(txt[j + len])))) { + while (backward ? j >= 0 : j <= m - (nextline ? 
1 : len)) { + bool wholeWordStartIsOk, wholeWordEndIsOk; + + if (wholeWord) { + wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(txt[j - 1]); + + if (nextline) + wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later + else + wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]); + } + + if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) { + int n = 0; + bool spaceConsumedByNewline = false; + bool found_it; // compare the strings for (k = 0; k < len; ++k) { - if (p[k] != s2[k]) { + bool last_char_of_line = j + k == m - 1; + bool last_char_of_search_term = k == len - 1; + + if (p[k] != s2[k] || (nextline && last_char_of_line && !last_char_of_search_term)) { + // now check if the comparison failed at the end-of-line hyphen, + // and if so, keep on comparing at the next line + nextlineAfterHyphen = false; + + if (s2[k] == p[k]) + k++; + else if (p[k] != (Unicode)'-') + break; + else + nextlineAfterHyphen = true; + + for (; n < nextline_len && k < len; ++k, ++n) { + if (nextline[n] != s2[k]) { + if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(s2[k])) { + n = -1; + spaceConsumedByNewline = true; + continue; + } + break; + } + } + break; } } + found_it = k == len; + + if (found_it && nextline && wholeWord) { // check word end for nextline case + if (n) { // Match ended at next line + wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(nextline[n]); + } else { // Match ended on same line + wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]); + } + if (!wholeWordEndIsOk) + found_it = false; + } // found it - if (k == len) { + if (found_it) { + bool nextLineMatch = (bool) n; + if (spaceConsumedByNewline) + k--; // where s2 matches a subsequence of a compatibility equivalence // decomposition, highlight the entire glyph, since we don't know // the internal layout of subglyph components int normStart, normAfterEnd; if (ignoreDiacritics) { normStart = line->ascii_idx[j]; - normAfterEnd = 
line->ascii_idx[j + len - 1] + 1; + if (nextline) + normAfterEnd = line->ascii_idx[j + k - n]; + else + normAfterEnd = line->ascii_idx[j + len - 1] + 1; } else { normStart = line->normalized_idx[j]; - normAfterEnd = line->normalized_idx[j + len - 1] + 1; + if (nextline) + normAfterEnd = line->normalized_idx[j + k - n]; + else + normAfterEnd = line->normalized_idx[j + len - 1] + 1; } switch (line->rot) { case 0: @@ -4102,6 +4213,36 @@ bool TextPage::findText(Unicode *s, int len, yMin0 = yMin1; yMax0 = yMax1; found = true; + if (nextLineMatch) { + // set the out parameters + if (afterHyphen) { *afterHyphen = nextlineAfterHyphen; } + switch (line->next->rot) { + case 0: + if (xMinNext) { *xMinNext = line->next->edge[0]; } + if (xMaxNext) { *xMaxNext = line->next->edge[n]; } + if (yMinNext) { *yMinNext = line->next->yMin; } + if (yMaxNext) { *yMaxNext = line->next->yMax; } + break; + case 1: + if (xMinNext) { *xMinNext = line->next->xMin; } + if (xMaxNext) { *xMaxNext = line->next->xMax; } + if (yMinNext) { *yMinNext = line->next->edge[0]; } + if (yMaxNext) { *yMaxNext = line->next->edge[n]; } + break; + case 2: + if (xMinNext) { *xMinNext = line->next->edge[n]; } + if (xMaxNext) { *xMaxNext = line->next->edge[0]; } + if (yMinNext) { *yMinNext = line->next->yMin; } + if (yMaxNext) { *yMaxNext = line->next->yMax; } + break; + case 3: + if (xMinNext) { *xMinNext = line->next->xMin; } + if (xMaxNext) { *xMaxNext = line->next->xMax; } + if (yMinNext) { *yMinNext = line->next->edge[n]; } + if (yMaxNext) { *yMaxNext = line->next->edge[0]; } + break; + } + } } } } @@ -4115,6 +4256,12 @@ bool TextPage::findText(Unicode *s, int len, ++p; } } + + if (nextline && nextline != line->next->ascii_translation && + nextline != line->next->normalized) { + gfree(nextline); + } + } } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 3ff1754a93ec116470cbf81c3140eca825799bc0..63ae8f8cd09a9dffcf53b6709dc564f8b94f6cce 100644 --- a/poppler/TextOutputDev.h +++ 
b/poppler/TextOutputDev.h @@ -623,6 +623,29 @@ public: double *xMin, double *yMin, double *xMax, double *yMax); + // Adds new parameter <matchAcrossLines>, which allows to match + // on text spanning from end of a line to the next line. In that case, + // the coords for the part of match that falls on the next line will + // be stored in <xMinNext>,<yMinNext>, and <xMaxNext>,<yMaxNext> + // and if hyphenation (i.e. ignoring hyphen at end of a line) was used + // while matching at the end of first line, then <afterHyphen> will be true, + // otherwise it will be false. + // Only finding across two lines is supported, i.e. it won't match where + // <s> spans more than two lines. + // + // <matchAcrossLines> will be ignored if <backward> is true (as that + // combination has not been implemented yet). + bool findText(Unicode *s, int len, + bool startAtTop, bool stopAtBottom, + bool startAtLast, bool stopAtLast, + bool caseSensitive, bool ignoreDiacritics, + bool matchAcrossLines, + bool backward, bool wholeWord, + double *xMin, double *yMin, + double *xMax, double *yMax, + double *xMinNext, double *yMinNext, + double *xMaxNext, double *yMaxNext, bool *afterHyphen); + // Get the text which is inside the specified rectangle. 
GooString *getText(double xMin, double yMin, double xMax, double yMax); diff --git a/qt5/src/poppler-page-private.h b/qt5/src/poppler-page-private.h index d78131247f7901a71409a2367cccaa6612c50653..a81f5f2252cbb907b396ae5082d74835eab81267 100644 --- a/qt5/src/poppler-page-private.h +++ b/qt5/src/poppler-page-private.h @@ -49,8 +49,8 @@ public: static Link* convertLinkActionToLink(::LinkAction * a, DocumentData *parentDoc, const QRectF &linkArea); TextPage *prepareTextSearch(const QString &text, Page::Rotation rotate, QVector *u); - bool performSingleTextSearch(TextPage* textPage, QVector &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics); - QList performMultipleTextSearch(TextPage* textPage, QVector &u, bool sCase, bool sWords, bool sDiacritics); + bool performSingleTextSearch(TextPage* textPage, QVector &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines); + QList performMultipleTextSearch(TextPage* textPage, QVector &u, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines); }; } diff --git a/qt5/src/poppler-page.cc b/qt5/src/poppler-page.cc index 3e7506207d284069057d5271640950aa7dfe233f..381a149894afb9e4346272be9ff54bca4324d383 100644 --- a/qt5/src/poppler-page.cc +++ b/qt5/src/poppler-page.cc @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -393,28 +394,35 @@ inline TextPage *PageData::prepareTextSearch(const QString &text, Page::Rotation return textPage; } -inline bool PageData::performSingleTextSearch(TextPage* textPage, QVector &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics = false) +inline bool PageData::performSingleTextSearch(TextPage* textPage, QVector &u, double &sLeft, double &sTop, double &sRight, double &sBottom, 
Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics = false, bool sAcrossLines = false) { if (direction == Page::FromTop) return textPage->findText( u.data(), u.size(), - true, true, false, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom ); + true, true, false, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, + nullptr, nullptr, nullptr, nullptr, nullptr ); else if ( direction == Page::NextResult ) return textPage->findText( u.data(), u.size(), - false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom ); + false, true, true, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, + nullptr, nullptr, nullptr, nullptr, nullptr ); else if ( direction == Page::PreviousResult ) return textPage->findText( u.data(), u.size(), - false, true, true, false, sCase, sDiacritics, true, sWords, &sLeft, &sTop, &sRight, &sBottom ); + false, true, true, false, sCase, sDiacritics, sAcrossLines, true, sWords, &sLeft, &sTop, &sRight, &sBottom, + nullptr, nullptr, nullptr, nullptr, nullptr ); return false; } -inline QList PageData::performMultipleTextSearch(TextPage* textPage, QVector &u, bool sCase, bool sWords, bool sDiacritics = false) +inline QList PageData::performMultipleTextSearch(TextPage* textPage, QVector &u, bool sCase, bool sWords, bool sDiacritics = false, bool sAcrossLines = false) { QList results; double sLeft = 0.0, sTop = 0.0, sRight = 0.0, sBottom = 0.0; + double sLeftN, sTopN, sRightN, sBottomN; + bool sAfterHyphen = false; + sLeftN = DBL_MAX; // we use this to detect valid return values while(textPage->findText( u.data(), u.size(), - false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom )) + false, true, true, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, + &sLeftN, &sTopN, &sRightN, &sBottomN, &sAfterHyphen )) { QRectF 
result; @@ -424,6 +432,19 @@ inline QList PageData::performMultipleTextSearch(TextPage* textPage, QVe result.setBottom(sBottom); results.append(result); + + if (sAcrossLines && sLeftN != DBL_MAX) { + QRectF resultN; + + resultN.setLeft(sLeftN); + resultN.setTop(sTopN); + resultN.setRight(sRightN); + resultN.setBottom(sBottomN); + + results.append(resultN); + + sLeftN = DBL_MAX; + } } return results; @@ -718,11 +739,12 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig const bool sCase = flags.testFlag(IgnoreCase) ? false : true; const bool sWords = flags.testFlag(WholeWords) ? true : false; const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false; + const bool sAcrossLines = flags.testFlag(AcrossLines) ? true : false; QVector u; TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u); - const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics); + const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics, sAcrossLines); textPage->decRefCnt(); @@ -748,11 +770,12 @@ QList Page::search(const QString &text, SearchFlags flags, Rotation rota const bool sCase = flags.testFlag(IgnoreCase) ? false : true; const bool sWords = flags.testFlag(WholeWords) ? true : false; const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false; + const bool sAcrossLines = flags.testFlag(AcrossLines) ? 
true : false; QVector u; TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u); - const QList results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics); + const QList results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics, sAcrossLines); textPage->decRefCnt(); diff --git a/qt5/src/poppler-qt5.h b/qt5/src/poppler-qt5.h index c03fec15ca7420bb5090e2749ab566e2acc15f6b..91e9a26276aee88a1f3b80a024b6d7d8d8fb839b 100644 --- a/qt5/src/poppler-qt5.h +++ b/qt5/src/poppler-qt5.h @@ -743,9 +743,12 @@ delete it; NoSearchFlags = 0x00000000, ///< since 0.63 IgnoreCase = 0x00000001, ///< Case differences are ignored WholeWords = 0x00000002, ///< Only whole words are matched - IgnoreDiacritics = 0x00000004 ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored. \since 0.73 + IgnoreDiacritics = 0x00000004, ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored. \since 0.73 ///< This option will have no effect if the search term contains characters which ///< are not pure ascii. + AcrossLines = 0x00000008 ///< Allows to match on text spanning from end of a line to the next line. + ///< It won't match on text spanning more than two lines. Automatically ignores hyphen + ///< at end of line, and allows whitespace in search term to match on newline. \since 0.77. 
}; Q_DECLARE_FLAGS( SearchFlags, SearchFlag ) diff --git a/qt5/tests/check_search.cpp b/qt5/tests/check_search.cpp index 7c251c5de95fa6949a983616a8964b1dd2b2eb53..56a0c7eadf5b6f136c8e847db0678ab4e5d96f1d 100644 --- a/qt5/tests/check_search.cpp +++ b/qt5/tests/check_search.cpp @@ -13,6 +13,7 @@ private slots: void testWholeWordsOnly(); void testIgnoreDiacritics(); void testRussianSearch(); // Issue #743 + void testAcrossLinesSearch(); }; void TestSearch::bug7063() @@ -260,6 +261,88 @@ void TestSearch::testRussianSearch() QCOMPARE( page->search(str, l, t, r, b, direction, mode2W), true ); } +void TestSearch::testAcrossLinesSearch() +{ + // Test for searching across lines with new flag Poppler::Page::AcrossLines + // and its automatic features like ignoring hyphen at end of line or allowing + // whitespace in the search term to match on newline character. + QScopedPointer< Poppler::Document > document(Poppler::Document::load(TESTDATADIR "/unittestcases/searchAcrossLines.pdf")); + QVERIFY( document ); + + QScopedPointer< Poppler::Page > page(document->page(1)); + QVERIFY( page ); + + const Poppler::Page::SearchDirection direction = Poppler::Page::FromTop; + + const Poppler::Page::SearchFlags empty = Poppler::Page::NoSearchFlags; + const Poppler::Page::SearchFlags mode0 = Poppler::Page::AcrossLines; + const Poppler::Page::SearchFlags mode1 = Poppler::Page::AcrossLines | Poppler::Page::IgnoreDiacritics; + const Poppler::Page::SearchFlags mode2 = Poppler::Page::AcrossLines | Poppler::Page::IgnoreDiacritics | Poppler::Page::IgnoreCase; + const Poppler::Page::SearchFlags mode2W = mode2 | Poppler::Page::WholeWords; + + double l, t, r, b; //left, top, right, bottom + + // In the searched page, each of "re-conocimiento" "PRUE-BA" "imáge-nes" happen split across lines + const QString str1 = QString::fromUtf8("reconocimiento"); //clazy:exclude=qstring-allocations + const QString str2 = QString::fromUtf8("IMagenes"); //clazy:exclude=qstring-allocations + // Test it cannot be 
found with empty search flags + QCOMPARE( page->search(str1, l, t, r, b, direction, empty), false ); + // Test it is found with AcrossLines option + QCOMPARE( page->search(str1, l, t, r, b, direction, mode0), true ); + // Test AcrossLines with IgnoreDiacritics and IgnoreCase options + QCOMPARE( page->search(str2, l, t, r, b, direction, mode0), false ); + QCOMPARE( page->search(str2, l, t, r, b, direction, mode1), false ); + QCOMPARE( page->search(str2, l, t, r, b, direction, mode2), true ); + // Test with WholeWords too + QCOMPARE( page->search(str2, l, t, r, b, direction, mode2W),true ); + + // Now test that AcrossLines also allows whitespace in the search term to match on newline char. + // In the searched page, "podrá" ends a line and "acordar" starts the next line, so we + // now test we match it with "podrá acordar" + const QString str3 = QString::fromUtf8("podrá acordar,"); //clazy:exclude=qstring-allocations + QCOMPARE( page->search(str3, l, t, r, b, direction, mode0), true ); + QCOMPARE( page->search(str3, l, t, r, b, direction, mode1), true ); + QCOMPARE( page->search(str3, l, t, r, b, direction, mode2), true ); + QCOMPARE( page->search(str3, l, t, r, b, direction, mode2W),true ); + // now test it also works with IgnoreDiacritics and IgnoreCase + const QString str4 = QString::fromUtf8("PODRA acordar"); //clazy:exclude=qstring-allocations + QCOMPARE( page->search(str4, l, t, r, b, direction, mode0), false ); + QCOMPARE( page->search(str4, l, t, r, b, direction, mode1), false ); + QCOMPARE( page->search(str4, l, t, r, b, direction, mode2), true ); + QCOMPARE( page->search(str4, l, t, r, b, direction, mode2W),false ); //false as it lacks ending comma + + // Now test that when a hyphen char in the search term matches a hyphen at end of line, + // then we don't automatically ignore it, but treat it as a normal char. 
+ // In the searched page, "CC BY-NC-SA 4.0" is split across two lines on the second hyphen + const QString str5 = QString::fromUtf8("CC BY-NC-SA 4.0"); //clazy:exclude=qstring-allocations + QScopedPointer< Poppler::Page > page0(document->page(0)); + QVERIFY( page0 ); + QCOMPARE( page0->search(str5, l, t, r, b, direction, mode0), true ); + QCOMPARE( page0->search(str5, l, t, r, b, direction, mode1), true ); + QCOMPARE( page0->search(str5, l, t, r, b, direction, mode2), true ); + QCOMPARE( page0->search(str5, l, t, r, b, direction, mode2W),true ); + QCOMPARE( page0->search(QString::fromUtf8("NC-SA"), l, t, r, b, direction, mode2W), false ); + // Searching for "CC BY-NCSA 4.0" should also match, because hyphen is now ignored at end of line + const QString str6 = QString::fromUtf8("CC BY-NCSA 4.0"); //clazy:exclude=qstring-allocations + QCOMPARE( page0->search(str6, l, t, r, b, direction, mode0), true ); + QCOMPARE( page0->search(str6, l, t, r, b, direction, mode1), true ); + QCOMPARE( page0->search(str6, l, t, r, b, direction, mode2), true ); + QCOMPARE( page0->search(str6, l, t, r, b, direction, mode2W),true ); + + // Now for completeness, we will match the full text of two lines + const QString full2lines = QString::fromUtf8("Las pruebas se practicarán en vista pública, si bien, excepcionalmente, el Tribunal podrá acordar, mediante providencia, que determinadas pruebas se celebren fuera del acto de juicio"); //clazy:exclude=qstring-allocations + QCOMPARE( page->search(full2lines, l, t, r, b, direction, mode0), true ); + QCOMPARE( page->search(full2lines, l, t, r, b, direction, mode1), true ); + QCOMPARE( page->search(full2lines, l, t, r, b, direction, mode2), true ); + QCOMPARE( page->search(full2lines, l, t, r, b, direction, mode2W),true ); + // And now the full text of two lines split by a hyphenated word + const QString full2linesHyphenated = QString::fromUtf8("Consiste básicamente en información digitalizada, codificados y alojados en un elemento contenedor 
digital (equipos, dispositivos periféricos, unidades de memoria, unidades virtualizadas, tramas"); //clazy:exclude=qstring-allocations + QCOMPARE( page->search(full2linesHyphenated, l, t, r, b, direction, mode0), true ); + QCOMPARE( page->search(full2linesHyphenated, l, t, r, b, direction, mode1), true ); + QCOMPARE( page->search(full2linesHyphenated, l, t, r, b, direction, mode2), true ); + QCOMPARE( page->search(full2linesHyphenated, l, t, r, b, direction, mode2W),true ); +} + QTEST_GUILESS_MAIN(TestSearch) #include "check_search.moc"