Commit 31893320, authored by suzuki toshiya, committed by Albert Astals Cid

[cpp] separate the font info in text_box to another struct.

* add a new API, page::text_list(int opt_flag). The old one
taking no argument is kept for ABI compatibility.
The opt_flag is a bitwise OR of values from the new enum,
page::text_list_option_enum.

* text_box.m_data->text_box_font is a unique pointer to
the storage (if text_list() requests the font info), or
just a null pointer (if text_list() does not request the
font info).

* add a new option "--show-text-list-with-font", which shows
font info, to tests/poppler-dump.cpp. "--show-text-list"
does not load the font info at all.
Co-authored-by: Adam Reichold <adam.reichold@t-online.de>
Co-authored-by: Albert Astals Cid <aacid@kde.org>
parent 437553ec
......@@ -299,7 +299,7 @@ static void appendToGooString(void *stream, const char *text, int len) {
ustring page::text(const rectf &r, text_layout_enum layout_mode) const
{
std::unique_ptr<GooString> out(new GooString());
const bool use_raw_order = (layout_mode == raw_order_layout);
const bool use_raw_order = (layout_mode == raw_order_layout);
const bool use_physical_layout = (layout_mode == physical_layout);
TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false);
if (r.is_empty()) {
......@@ -311,6 +311,11 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
return ustring::from_utf8(out->c_str());
}
/*
 * text_box_font_info_data: the optional font-information block that a
 * text_box may own (see text_box_data::text_box_font).
 *
 * The destructor is declared in the struct and defined here with the
 * compiler-generated behaviour.
 */
text_box_font_info_data::~text_box_font_info_data() = default;
/*
* text_box object for page::text_list()
*/
......@@ -352,30 +357,41 @@ bool text_box::has_space_after() const
return m_data->has_space_after;
}
/*
 * Report whether this text_box owns a font-information block.
 * The block is held through a unique pointer that stays null when
 * no font information was attached to the box.
 */
bool text_box::has_font_info() const
{
    const bool font_block_present = static_cast<bool>(m_data->text_box_font);
    return font_block_present;
}
/*
 * Return the writing mode recorded for the i-th glyph of this box.
 *
 * The per-glyph writing modes live in the optional font-information
 * block, so text_box::invalid_wmode is returned when the box carries
 * no font information. The index i is not range-checked here.
 */
text_box::writing_mode_enum text_box::get_wmode(int i) const
{
    // The writing modes are no longer stored directly in m_data; they
    // must only be read through text_box_font after the null check.
    if (!this->has_font_info()) {
        return text_box::invalid_wmode;
    }
    return m_data->text_box_font->wmodes[i];
}
/*
 * Return the font size stored for this box.
 *
 * The size lives in the optional font-information block; -1 is
 * returned when the box carries no font information.
 */
double text_box::get_font_size() const
{
    // Guard first: text_box_font is a null pointer when font info was
    // not requested, so it must not be dereferenced unconditionally.
    if (!this->has_font_info()) {
        return -1;
    }
    return m_data->text_box_font->font_size;
}
/*
 * Return the name of the font used by the i-th glyph of this box.
 *
 * Returns "*ignored*" when the box carries no font information, and
 * an empty string when the glyph has no matching entry in the font
 * cache (cache index -1). The index i is not range-checked here.
 */
std::string text_box::get_font_name(int i) const
{
    if (!this->has_font_info()) {
        return std::string("*ignored*");
    }

    // The glyph-to-cache mapping is part of the font-information block,
    // so it is looked up only after the null check above.
    const int cache_index = m_data->text_box_font->glyph_to_cache_index[i];
    if (cache_index < 0) {
        return std::string("");
    }
    return m_data->text_box_font->font_info_cache[cache_index].name();
}
std::vector<text_box> page::text_list() const
std::vector<text_box> page::text_list(int opt_flag) const
{
d->init_font_info_cache();
std::vector<text_box> output_list;
/* config values are same with Qt5 Page::TextList() */
......@@ -419,41 +435,55 @@ std::vector<text_box> page::text_list() const
word->getRotation(),
{},
word->hasSpaceAfter() == true,
{},
word->getFontSize(),
d->font_info_cache,
{}
nullptr
}};
std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr;
if (opt_flag & page::text_list_include_font) {
d->init_font_info_cache();
std::unique_ptr<text_box_font_info_data> tb_font{new text_box_font_info_data{
word->getFontSize(), // double font_size
{}, // std::vector<text_box::writing_mode> wmodes;
d->font_info_cache, // std::vector<font_info> font_info_cache;
{} // std::vector<int> glyph_to_cache_index;
}};
tb_font_info = std::move(tb_font);
};
tb.m_data->char_bboxes.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j ++) {
word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax);
tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
}
tb.m_data->glyph_to_cache_index.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j++) {
const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
// filter-out the invalid WMode value here.
switch (cur_text_font_info->getWMode()) {
case 0:
tb.m_data->wmodes.push_back(text_box::horizontal_wmode);
break;
case 1:
tb.m_data->wmodes.push_back(text_box::vertical_wmode);
break;
default:
tb.m_data->wmodes.push_back(text_box::invalid_wmode);
};
tb.m_data->glyph_to_cache_index[j] = -1;
for (size_t k = 0; k < d->font_info_cache.size(); k++) {
if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
tb.m_data->glyph_to_cache_index[j] = k;
if (tb_font_info && d->font_info_cache_initialized) {
tb_font_info->glyph_to_cache_index.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j++) {
const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
// filter-out the invalid WMode value here.
switch (cur_text_font_info->getWMode()) {
case 0:
tb_font_info->wmodes.push_back(text_box::horizontal_wmode);
break;
case 1:
tb_font_info->wmodes.push_back(text_box::vertical_wmode);
break;
default:
tb_font_info->wmodes.push_back(text_box::invalid_wmode);
};
tb_font_info->glyph_to_cache_index[j] = -1;
for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) {
if (cur_text_font_info->matches(&(tb_font_info->font_info_cache[k].d->ref))) {
tb_font_info->glyph_to_cache_index[j] = k;
break;
}
}
}
tb.m_data->text_box_font = std::move(tb_font_info);
}
output_list.push_back(std::move(tb));
......@@ -462,3 +492,8 @@ std::vector<text_box> page::text_list() const
return output_list;
}
std::vector<text_box> page::text_list() const
{
return text_list(0);
}
......@@ -66,6 +66,12 @@ public:
rectf char_bbox(size_t i) const;
bool has_space_after() const;
/**
Tell whether this text box carries font information.

When this returns false, the font-related accessors have no data
to report and return placeholder values instead.

\since 0.89
*/
bool has_font_info() const;
/**
Get a writing mode for the i-th glyph
......@@ -186,6 +192,22 @@ public:
*/
std::vector<text_box> text_list() const;
/**
Option flags for text_list(int).

The values are bit flags meant to be combined with bitwise OR;
0 means the default and simplest behaviour.
*/
enum text_list_option_enum {
text_list_include_font = 1 // \since 0.89; request font information for each text_box
};
/**
Extended version of text_list() taking an option flag.

The option flag should be a bitwise OR of text_list_option_enum
values, or 0 for the default behaviour.

\since 0.89
*/
std::vector<text_box> text_list(int opt_flag) const;
private:
page(document_private *doc, int index);
......
......@@ -73,23 +73,17 @@ void delete_all(const Collection &c)
}
class font_info;
struct text_box_data
struct text_box_font_info_data
{
~text_box_data();
ustring text;
rectf bbox;
int rotation;
std::vector<rectf> char_bboxes;
bool has_space_after;
~text_box_font_info_data();
std::vector<text_box::writing_mode_enum> wmodes;
double font_size;
std::vector<text_box::writing_mode_enum> wmodes;
/*
* a duplication of the font_info_cache created by the
* poppler::font_iterator and owned by the poppler::page
* object. Its lifetime might differ from that of text_box
* object. Its lifetime might differ from that of text_box
* object (think about collecting all text_box objects
* from all pages), so we have to duplicate it into all
* text_box instances.
......@@ -97,7 +91,7 @@ struct text_box_data
std::vector<font_info> font_info_cache;
/*
* a std::vector from the glyph index in the current
* a std::vector from the glyph index in the owner
* text_box to the font_info index in font_info_cache.
* The "-1" means no corresponding fonts found in the
* cache.
......@@ -105,6 +99,20 @@ struct text_box_data
std::vector<int> glyph_to_cache_index;
};
class font_info;
/*
 * Private data of a text_box.
 *
 * NOTE: page::text_list() fills this struct with a positional brace
 * initializer, so the member order below must stay in sync with it.
 */
struct text_box_data
{
~text_box_data();
// text of the box
ustring text;
// bounding box of the whole text box
rectf bbox;
// rotation as reported by the source word (getRotation())
int rotation;
// one bounding box per character, in character order
std::vector<rectf> char_bboxes;
// whether the source word reports a space after it (hasSpaceAfter())
bool has_space_after;
// optional font information; a null pointer when text_list() was
// not asked to include font info
std::unique_ptr<text_box_font_info_data> text_box_font;
};
}
#endif
......@@ -60,6 +60,7 @@ bool show_help = false;
// NOTE(review): presumably set by a version flag; the binding is not visible in this excerpt.
bool show_version = false;
// NOTE(review): presumably holds the (physical|raw|none) argument of the show-text option; confirm against the_args.
char show_text[32];
// set by "--show-text-list": dump the text list without font info
bool show_text_list = false;
// set by "--show-text-list-with-font": dump the text list including font info
bool show_text_list_with_font = false;
// layout mode for text extraction; defaults to physical layout
poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
static const ArgDesc the_args[] = {
......@@ -85,6 +86,8 @@ static const ArgDesc the_args[] = {
"show text (physical|raw|none) extracted from all pages" },
{ "--show-text-list", argFlag, &show_text_list, 0,
"show text list (experimental)" },
{ "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0,
"show text list with font info (experimental)" },
{ "-h", argFlag, &show_help, 0,
"print usage information" },
{ "--help", argFlag, &show_help, 0,
......@@ -417,14 +420,14 @@ static void print_page_text(poppler::page *p)
std::cout << std::endl;
}
static void print_page_text_list(poppler::page *p)
static void print_page_text_list(poppler::page *p, int opt_flag = 0)
{
if (!p) {
std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
std::cout << std::endl;
return;
}
auto text_list = p->text_list();
auto text_list = p->text_list(opt_flag);
std::cout << "---" << std::endl;
for (const poppler::text_box &text : text_list) {
......@@ -435,9 +438,9 @@ static void print_page_text_list(poppler::page *p)
std::string font_name = text.get_font_name();
std::cout << "[" << ustr << "] @ ";
std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
if (text.has_font_info())
std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
std::cout << std::endl;
}
std::cout << "---" << std::endl;
}
......@@ -538,12 +541,15 @@ int main(int argc, char *argv[])
print_page_text(p.get());
}
}
if (show_text_list) {
if (show_text_list || show_text_list_with_font) {
const int pages = doc->pages();
for (int i = 0; i < pages; ++i) {
std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
std::unique_ptr<poppler::page> p(doc->create_page(i));
print_page_text_list(p.get());
if (show_text_list_with_font)
print_page_text_list(p.get(), poppler::page::text_list_include_font);
else
print_page_text_list(p.get(), 0);
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment