Commit 60400514, authored by suzuki toshiya and committed by Albert Astals Cid

[cpp] Add the font infos to the text_box object.

parent bf33c25b
......@@ -38,6 +38,7 @@ install(FILES
poppler-document.h
poppler-embedded-file.h
poppler-font.h
poppler-font-private.h
poppler-global.h
poppler-image.h
poppler-page.h
......
/*
* Copyright (C) 2009, Pino Toscano <pino@kde.org>
* Copyright (C) 2015, Tamas Szekeres <szekerest@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include "poppler-font.h"
#include "poppler-document-private.h"
#include "FontInfo.h"
#include <algorithm>
using namespace poppler;
/**
 * Private data behind poppler::font_info.
 *
 * Holds the font name and file, the font type, the embedded/subset flags
 * and the object references (font and embedded font stream) copied from
 * a core FontInfo.
 */
class poppler::font_info_private
{
public:
    /**
     * Builds an empty record: unknown type, not embedded, not a subset.
     *
     * Both references are set to {-1, -1} so that a default-constructed
     * record never carries indeterminate values and cannot compare equal
     * to a real object reference (object numbers are never negative).
     */
    font_info_private()
        : type(font_info::unknown)
        , is_embedded(false)
        , is_subset(false)
    {
        ref.num = -1;
        ref.gen = -1;
        emb_ref.num = -1;
        emb_ref.gen = -1;
    }

    /**
     * Copies the relevant fields out of \p fi.
     *
     * \param fi the core font description to copy from; it is dereferenced
     *           unconditionally, so it must not be null.
     */
    font_info_private(FontInfo *fi)
        : type(static_cast<font_info::type_enum>(fi->getType()))
        , is_embedded(fi->getEmbedded())
        , is_subset(fi->getSubset())
    {
        // Name and file are optional; the strings stay empty when absent.
        if (const auto name = fi->getName()) {
            font_name = name->c_str();
        }
        if (const auto file = fi->getFile()) {
            font_file = file->c_str();
        }

        const auto font_ref = fi->getRef();
        ref.num = font_ref.num;
        ref.gen = font_ref.gen;

        const auto embedded_ref = fi->getEmbRef();
        emb_ref.num = embedded_ref.num;
        emb_ref.gen = embedded_ref.gen;
    }

    std::string font_name;
    std::string font_file;
    font_info::type_enum type : 5;
    bool is_embedded : 1;
    bool is_subset : 1;

    // Reference of the font object, as reported by FontInfo::getRef().
    Ref ref;
    // Reference reported by FontInfo::getEmbRef().
    Ref emb_ref;
};
/**
 * Private state of poppler::font_iterator: the core font scanner plus
 * the page bookkeeping used while iterating.
 */
class poppler::font_iterator_private
{
public:
    /**
     * \param start_page page index handed to the scanner; negative values
     *                   are clamped to 0 for current_page only
     * \param dd         private document data providing the core document
     */
    font_iterator_private(int start_page, document_private *dd)
        : font_info_scanner(dd->doc, start_page)
        , total_pages(dd->doc->getNumPages())
        , current_page(start_page < 0 ? 0 : start_page)
    {
    }

    ~font_iterator_private() = default;

    FontInfoScanner font_info_scanner;
    int total_pages;
    int current_page;
};
......@@ -24,6 +24,8 @@
*/
#include "poppler-font.h"
#include "poppler-font-private.h"
#include "poppler-document-private.h"
#include "FontInfo.h"
......@@ -32,54 +34,6 @@
using namespace poppler;
/**
 * Private data behind poppler::font_info: font name, font file, font
 * type and the embedded/subset flags.
 */
class poppler::font_info_private
{
public:
    /** Builds an empty record: unknown type, not embedded, not a subset. */
    font_info_private()
        : type(font_info::unknown)
        , is_embedded(false)
        , is_subset(false)
    {
    }

    /**
     * Copies type, flags, name and file out of \p fi.
     *
     * \param fi the core font description; dereferenced unconditionally.
     */
    font_info_private(FontInfo *fi)
        : type(static_cast<font_info::type_enum>(fi->getType()))
        , is_embedded(fi->getEmbedded())
        , is_subset(fi->getSubset())
    {
        // Name and file are optional; the strings stay empty when absent.
        if (const auto name = fi->getName()) {
            font_name = name->c_str();
        }
        if (const auto file = fi->getFile()) {
            font_file = file->c_str();
        }
    }

    std::string font_name;
    std::string font_file;
    font_info::type_enum type : 5;
    bool is_embedded : 1;
    bool is_subset : 1;
};
/**
 * Private state of poppler::font_iterator: the core font scanner, the
 * number of pages in the document and the page the iterator is on.
 */
class poppler::font_iterator_private
{
public:
    /**
     * \param start_page page index handed to the scanner; negative values
     *                   are clamped to 0 for current_page only
     * \param dd         private document data providing the core document
     */
    font_iterator_private(int start_page, document_private *dd)
        : font_info_scanner(dd->doc, start_page)
        , total_pages(dd->doc->getNumPages())
        , current_page(start_page < 0 ? 0 : start_page)
    {
    }

    ~font_iterator_private() = default;

    FontInfoScanner font_info_scanner;
    int total_pages;
    int current_page;
};
/**
\class poppler::font_info poppler-font.h "poppler/cpp/poppler-font.h"
......@@ -208,7 +162,7 @@ font_iterator::~font_iterator()
}
/**
Returns the fonts of the current page and advances to the next one.
\returns the fonts of the current page and advances to the next one.
*/
std::vector<font_info> font_iterator::next()
{
......@@ -218,6 +172,10 @@ std::vector<font_info> font_iterator::next()
++d->current_page;
/* FontInfoScanner::scan() receives the number of pages to
* scan, counted from the *current page*, not from the beginning.
* We restrict the font scanning to the current page only.
*/
const std::vector<FontInfo*> items = d->font_info_scanner.scan(1);
std::vector<font_info> fonts;
fonts.reserve(items.size());
......
......@@ -67,6 +67,7 @@ private:
font_info_private *d;
friend class font_iterator;
friend class page;
};
......@@ -84,6 +85,8 @@ private:
font_iterator_private *d;
friend class document;
friend class page;
friend class page_private;
};
}
......
......@@ -29,6 +29,7 @@ namespace poppler
class document_private;
class page_transition;
class font_info;
class page_private
{
......@@ -46,6 +47,9 @@ public:
static inline page_private* get(const poppler::page *p)
{ return const_cast<poppler::page *>(p)->d; }
std::vector<font_info> font_info_cache;
size_t init_font_info_cache();
};
}
......
......@@ -32,6 +32,8 @@
#include "poppler-document-private.h"
#include "poppler-page-private.h"
#include "poppler-private.h"
#include "poppler-font-private.h"
#include "poppler-font.h"
#include "TextOutputDev.h"
......@@ -54,6 +56,22 @@ page_private::~page_private()
delete transition;
}
size_t page_private::init_font_info_cache()
{
if (font_info_cache.size() > 0)
return font_info_cache.size();
poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc);
if (font_iterator->has_next()) {
font_info_cache = font_iterator->next();
}
delete font_iterator;
return font_info_cache.size();
}
/**
\class poppler::page poppler-page.h "poppler/cpp/poppler-page.h"
......@@ -334,17 +352,39 @@ bool text_box::has_space_after() const
return m_data->has_space_after;
}
/**
 * Looks up the writing mode stored for the i-th glyph.
 *
 * \param i glyph index; it is used directly as a vector subscript and is
 *          not range-checked here
 */
int text_box::get_wmode(int i) const
{
    const std::vector<int> &modes = m_data->wmodes;
    return modes[i];
}
/**
 * \returns the font size stored in this text_box's data
 */
double text_box::get_font_size() const
{
    const double size = m_data->font_size;
    return size;
}
/**
 * Resolves the font name for the i-th glyph through the per-box font
 * cache.
 *
 * \param i glyph index; used directly as a vector subscript, without a
 *          range check
 * \returns the cached font's name, or an empty string when the glyph's
 *          cache index is negative (no font recorded for it)
 */
std::string text_box::get_font_name(int i) const
{
    const int cache_index = m_data->glyph_to_cache_index[i];
    if (cache_index >= 0) {
        return m_data->font_info_cache[cache_index].name();
    }
    return std::string("");
}
std::vector<text_box> page::text_list() const
{
d->init_font_info_cache();
std::vector<text_box> output_list;
/* config values are same with Qt5 Page::TextList() */
auto output_dev = std::make_unique<TextOutputDev>(
nullptr, /* char* fileName */
false, /* bool physLayoutA */
nullptr, /* char* fileName */
false, /* bool physLayoutA */
0, /* double fixedPitchA */
false, /* bool rawOrderA */
false /* bool append */
false, /* bool rawOrderA */
false /* bool append */
);
/*
......@@ -378,7 +418,11 @@ std::vector<text_box> page::text_list() const
{xMin, yMin, xMax-xMin, yMax-yMin},
word->getRotation(),
{},
word->hasSpaceAfter() == true
word->hasSpaceAfter() == true,
{},
word->getFontSize(),
d->font_info_cache,
{}
}};
tb.m_data->char_bboxes.reserve(word->getLength());
......@@ -387,6 +431,20 @@ std::vector<text_box> page::text_list() const
tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
}
/*
 * Record, for every glyph of the word, its writing mode and the index
 * of its font in the page's font_info_cache (-1 when no cached font
 * matches).
 *
 * reserve() only allocates capacity and leaves size() at 0, so the
 * entries must be appended with push_back(); writing through
 * operator[] on the still-empty vector is out of bounds.
 */
tb.m_data->wmodes.reserve(word->getLength());
tb.m_data->glyph_to_cache_index.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j++) {
    const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
    tb.m_data->wmodes.push_back(cur_text_font_info->getWMode());

    int cache_index = -1;
    for (size_t k = 0; k < d->font_info_cache.size(); k++) {
        if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
            cache_index = static_cast<int>(k);
            break;
        }
    }
    tb.m_data->glyph_to_cache_index.push_back(cache_index);
}
output_list.push_back(std::move(tb));
}
}
......
......@@ -65,6 +65,56 @@ public:
*/
rectf char_bbox(size_t i) const;
bool has_space_after() const;
/**
\since 0.8x
*/
/**
Get a writing mode for the i-th glyph
This method returns an integer of the writing mode
for the i-th glyph in the text_box.
0 means the horizontal writing mode.
1 means the vertical writing mode.
\note Usually all glyphs in one text_box have the
same writing mode. Thus the default value of the
glyph index is 0.
*/
int get_wmode(int i = 0) const;
/**
Get a font size of this text_box instance.
This method returns the font size of the text_box
instance as a double-precision floating-point value.
*/
double get_font_size() const;
/**
Get a font name for the i-th glyph
This method returns a std::string object holding
the font name for the i-th glyph.
\note The randomization prefixes of embedded fonts
are not removed. Font names that include these
prefixes are insufficient to determine whether
two fonts are the same or different.
\note The clients should not assume that the
encoding of the font name is one of the ASCII,
Latin1 or UTF-8. Some legacy PDF producers used
in CJK market use GBK, Big5, Wansung or Shift-JIS.
\note The std::string is returned by value: the caller
receives its own copy, which does not refer to storage
inside the text_box instance.
*/
std::string get_font_name(int i = 0) const;
private:
text_box(text_box_data *data);
......
......@@ -71,6 +71,7 @@ void delete_all(const Collection &c)
delete_all(c.begin(), c.end());
}
class font_info;
struct text_box_data
{
~text_box_data();
......@@ -80,6 +81,27 @@ struct text_box_data
int rotation;
std::vector<rectf> char_bboxes;
bool has_space_after;
std::vector<int> wmodes;
double font_size;
/*
* a duplication of the font_info_cache created by the
* poppler::font_iterator and owned by the poppler::page
* object. Its lifetime might differ from that of text_box
* object (think about collecting all text_box objects
* from all pages), so we have to duplicate it into all
* text_box instances.
*/
std::vector<font_info> font_info_cache;
/*
* a std::vector from the glyph index in the current
* text_box to the font_info index in font_info_cache.
* The "-1" means no corresponding fonts found in the
* cache.
*/
std::vector<int> glyph_to_cache_index;
};
}
......
......@@ -430,8 +430,12 @@ static void print_page_text_list(poppler::page *p)
for (const poppler::text_box &text : text_list) {
poppler::rectf bbox = text.bbox();
poppler::ustring ustr = text.text();
int wmode = text.get_wmode();
double font_size = text.get_font_size();
std::string font_name = text.get_font_name();
std::cout << "[" << ustr << "] @ ";
std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
std::cout << std::endl;
}
......
......@@ -341,6 +341,10 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const {
return gfxFont == fontInfo->gfxFont;
}
// Check whether this font info refers to the font object identified by
// ref (same object number and generation). Returns false when no font
// is set or when ref is null, instead of dereferencing a null pointer.
bool TextFontInfo::matches(const Ref *ref) const {
    if (!gfxFont || !ref) {
        return false;
    }
    const auto fontID = gfxFont->getID();
    return fontID->num == ref->num && fontID->gen == ref->gen;
}
// Ascent of the underlying font; falls back to 0.95 when no font is set.
double TextFontInfo::getAscent() const {
    if (!gfxFont) {
        return 0.95;
    }
    return gfxFont->getAscent();
}
......
......@@ -91,6 +91,7 @@ public:
bool matches(const GfxState *state) const;
bool matches(const TextFontInfo *fontInfo) const;
bool matches(const Ref *ref) const;
// Get the font ascent, or a default value if the font is not set
double getAscent() const;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment