Commit 17b26233 authored by Adrián Pérez de Castro, committed by Carlos Garcia Campos
Browse files

Tagged-PDF: Text content extraction from structure elements

Implement StructElement::getText(), by using MCOutputDev. This output device
captures pieces of text (aka "spans") which have the same attributes into
a list of TextSpan objects.

https://bugs.freedesktop.org/show_bug.cgi?id=64815
parent 2c4320c2
......@@ -232,6 +232,7 @@ poppler_include_HEADERS = \
NameToUnicodeTable.h \
PSOutputDev.h \
TextOutputDev.h \
MarkedContentOutputDev.h \
SecurityHandler.h \
UTF.h \
UTF8.h \
......@@ -306,6 +307,7 @@ libpoppler_la_SOURCES = \
XRef.cc \
PSOutputDev.cc \
TextOutputDev.cc \
MarkedContentOutputDev.cc \
PageLabelInfo.h \
PageLabelInfo.cc \
SecurityHandler.cc \
......
//========================================================================
//
// MarkedContentOutputDev.cc
//
// Copyright 2013 Igalia S.L.
//
//========================================================================
#include "MarkedContentOutputDev.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "GfxState.h"
#include "GfxFont.h"
#include "Annot.h"
#include <vector>
// Builds an output device that collects the text of the marked content
// sequence whose MCID equals mcidA. It starts with no font, no pending
// text buffer, no encoding map and a zero-sized page.
MarkedContentOutputDev::MarkedContentOutputDev(int mcidA)
  : currentFont(NULL)
  , currentText(NULL)
  , mcid(mcidA)
  , pageWidth(0.0)
  , pageHeight(0.0)
  , unicodeMap(NULL)
{
  // All components of the initial span color are zero.
  currentColor.b = 0;
  currentColor.g = 0;
  currentColor.r = 0;
}
// Drops the references held on the encoding map and on the current font
// (when present) and frees any text that was never turned into a span.
MarkedContentOutputDev::~MarkedContentOutputDev()
{
  if (unicodeMap != NULL) {
    unicodeMap->decRefCnt();
  }

  if (currentFont != NULL) {
    currentFont->decRefCnt();
  }

  // Deleting a NULL pointer is a no-op, so no check is needed here.
  delete currentText;
}
void MarkedContentOutputDev::endSpan()
{
if (currentText && currentText->getLength()) {
// The TextSpan takes ownership of currentText and
// increases the reference count for currentFont.
textSpans.push_back(TextSpan(currentText,
currentFont,
currentColor));
}
currentText = NULL;
}
// Records the page dimensions reported by the graphics state; without a
// state both dimensions are set to zero. pageNum and xref are not used.
void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
{
  if (!state) {
    pageWidth = 0.0;
    pageHeight = 0.0;
    return;
  }

  pageWidth = state->getPageWidth();
  pageHeight = state->getPageHeight();
}
// Forgets the page dimensions once the page is finished.
void MarkedContentOutputDev::endPage()
{
  pageHeight = 0.0;
  pageWidth = 0.0;
}
// Called when a marked content sequence begins.
//
//   name        tag of the sequence (not used here).
//   properties  property dictionary of the sequence, may be NULL; the
//               "MCID" integer entry is read from it when present.
//
// Collection starts when a sequence carrying the wanted MCID begins.
// While collecting, *every* nested sequence gets a stack entry, including
// those that carry no MCID at all (recorded as -1). endMarkedContent()
// pops one entry for each end marker seen while the stack is non-empty,
// so skipping the push for MCID-less nested sequences would make their
// end marker pop the entry of an enclosing sequence and stop the text
// collection too early.
void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties)
{
  int id = -1;
  if (properties)
    properties->lookupInt("MCID", NULL, &id);

  if (inMarkedContent()) {
    // The stack keeps track of the MCIDs of nested marked content; push
    // unconditionally to stay balanced with endMarkedContent().
    mcidStack.push_back(id);
    return;
  }

  // Not collecting yet: only a sequence with the requested MCID starts it.
  if (id != -1 && id == mcid)
    mcidStack.push_back(id);
}
// Called when a marked content sequence ends. Does nothing unless text is
// currently being collected; otherwise removes the innermost stack entry.
// The state argument is not used.
void MarkedContentOutputDev::endMarkedContent(GfxState *state)
{
  if (!inMarkedContent())
    return;

  mcidStack.pop_back();

  if (inMarkedContent())
    return;

  // The outermost tracked sequence was just closed: make sure the text
  // gathered so far is stored as a TextSpan.
  endSpan();
}
// Tells whether switching from currentFont to the given font counts as a
// font change:
//   - the very same pointer is never a change;
//   - with no current font, only a non-NULL font reporting isOk() is;
//   - going from a font to NULL is a change;
//   - otherwise the fonts differ when their IDs (num and gen) differ.
bool MarkedContentOutputDev::needFontChange(GfxFont* font) const
{
  if (font == currentFont)
    return gFalse;

  if (currentFont == NULL)
    return font != NULL && font->isOk();

  if (font == NULL)
    return gTrue;

  const Ref *oldId = currentFont->getID();
  const Ref *newId = font->getID();
  const bool sameRef = (oldId->num == newId->num) &&
                       (oldId->gen == newId->gen);
  return !sameRef;
}
// Receives one character and, while inside the tracked marked content,
// appends its encoded text to the span being accumulated.
//
//   state    graphics state; supplies render mode, colors, font, spacing
//            and the transforms used below.
//   xx, yy   character position, mapped through state->transform().
//   dx, dy   character advance, mapped through state->transformDelta()
//            after character/word spacing has been subtracted.
//   ox, oy   not used.
//   c        character code; 0x20 additionally gets the word spacing.
//   nBytes   not used.
//   u, uLen  Unicode values for the character and their count.
//
// A change of color or font closes the current span first, so that every
// TextSpan has uniform attributes.
void MarkedContentOutputDev::drawChar(GfxState *state,
                                      double xx, double yy,
                                      double dx, double dy,
                                      double ox, double oy,
                                      CharCode c, int nBytes,
                                      Unicode *u, int uLen)
{
  if (!inMarkedContent() || !uLen)
    return;

  // Color changes are tracked here so the color can be chosen depending on
  // the render mode (for mode 1 stroke color is used), so there is no need
  // to implement both updateFillColor() and updateStrokeColor().
  GfxRGB color;
  if ((state->getRender() & 3) == 1)
    state->getStrokeRGB(&color);
  else
    state->getFillRGB(&color);

  const GBool colorChange = (color.r != currentColor.r ||
                             color.g != currentColor.g ||
                             color.b != currentColor.b);

  // Check also for font changes.
  const GBool fontChange = needFontChange(state->getFont());

  // Save a span with the attributes that were in effect so far.
  if (colorChange || fontChange) {
    endSpan();
  }

  // Perform the color/font changes.
  if (colorChange)
    currentColor = color;

  if (fontChange) {
    if (currentFont != NULL) {
      currentFont->decRefCnt();
      currentFont = NULL;
    }
    if (state->getFont() != NULL) {
      currentFont = state->getFont();
      currentFont->incRefCnt();
    }
  }

  double sp, dx2, dy2, w1, h1, x1, y1;

  // Subtract char and word spacing from the (dx,dy) values
  sp = state->getCharSpace();
  if (c == (CharCode) 0x20)
    sp += state->getWordSpace();
  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
  dx -= dx2;
  dy -= dy2;
  state->transformDelta(dx, dy, &w1, &h1);
  state->transform(xx, yy, &x1, &y1);

  // Throw away characters that are not inside the page boundaries.
  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight)
    return;

  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
    return;

  // Fetch the output encoding map once, the first time it is needed.
  // NOTE(review): getTextEncoding() is defined outside this file; the
  // result is checked because dereferencing a NULL map below would crash.
  if (!unicodeMap) {
    unicodeMap = globalParams->getTextEncoding();
    if (!unicodeMap)
      return;
  }

  for (int i = 0; i < uLen; i++) {
    // Soft hyphen markers are skipped, as they are invisible unless
    // rendering is done to an actual device and the hyphenation hint
    // used. MarkedContentOutputDev extracts the *visible* text content.
    if (u[i] == 0x00AD)
      continue;

    // Add the encoded byte sequence to the current text span.
    char buf[8];
    int n = unicodeMap->mapUnicode(u[i], buf, sizeof(buf));
    if (n > 0) {
      if (currentText == NULL)
        currentText = new GooString();
      currentText->append(buf, n);
    }
  }
}
// Read-only access to the spans collected so far. The reference points at
// a member of this object.
const TextSpanArray& MarkedContentOutputDev::getTextSpans() const
{
  return this->textSpans;
}
//========================================================================
//
// MarkedContentOutputDev.h
//
// Copyright 2013 Igalia S.L.
//
//========================================================================
#ifndef MARKEDCONTENTOUTPUTDEV_H
#define MARKEDCONTENTOUTPUTDEV_H
#include "goo/gtypes.h"
#include "goo/gmem.h"
#include "OutputDev.h"
#include "GfxState.h"
#include "GfxFont.h"
#include <vector>
class Dict;
class UnicodeMap;
// A piece of text together with the font and color it was drawn with.
//
// TextSpan is a cheap-to-copy handle: all copies of a span point at one
// shared Data record that carries a reference count. The record (and with
// it the text string and the font reference) is released when the last
// handle goes away. Only MarkedContentOutputDev can create new spans.
class TextSpan {
public:
  // Shares the Data record of another span.
  TextSpan(const TextSpan& other): data(other.data) {
    data->refcount++;
  }

  // Rebinds this span to the Data record of another span. The reference
  // on the record held before the assignment is released; without that
  // the old record, its text and its font reference would never be freed.
  TextSpan& operator=(const TextSpan& other) {
    if (this != &other) {
      // Take the new reference first so that the sequence is also correct
      // when both spans already share the same record.
      Data *incoming = other.data;
      if (incoming)
        incoming->refcount++;
      if (data && --data->refcount == 0)
        delete data;
      data = incoming;
    }
    return *this;
  }

  // Drops this span's reference; the last one deletes the record.
  ~TextSpan() {
    if (data && --data->refcount == 0)
      delete data;
  }

  // Accessors for the shared record. The returned objects remain owned by
  // the record.
  GfxFont* getFont() const { return data->font; }
  GooString* getText() const { return data->text; }
  GfxRGB& getColor() const { return data->color; }

private:
  // Note: Takes ownership of strings, increases refcount for font.
  TextSpan(GooString *text,
           GfxFont *font,
           const GfxRGB& color)
    : data(new Data) {
    data->text = text;
    data->font = font;
    data->color = color;
    if (data->font)
      data->font->incRefCnt();
  }

  // Shared, reference-counted payload of a span.
  struct Data {
    GfxFont *font;
    GooString *text;
    GfxRGB color;
    unsigned refcount;

    // A fresh record starts with the single reference of its creator.
    Data(): refcount(1) {}

    // Must only run once no span refers to the record any more.
    ~Data() {
      assert(refcount == 0);
      if (font)
        font->decRefCnt();
      delete text;
    }
  };

  Data *data;

  friend class MarkedContentOutputDev;
};
typedef std::vector<TextSpan> TextSpanArray;
// Output device that captures the text drawn inside the marked content
// sequence with a given MCID, split into TextSpan objects that each have
// a single font and color.
class MarkedContentOutputDev: public OutputDev {
public:
  // mcidA is the MCID of the marked content sequence to capture.
  MarkedContentOutputDev(int mcidA);
  virtual ~MarkedContentOutputDev();

  // Fixed capability answers of this device.
  virtual GBool isOk() { return gTrue; }
  virtual GBool upsideDown() { return gTrue; }
  virtual GBool useDrawChar() { return gTrue; }
  virtual GBool interpretType3Chars() { return gFalse; }
  virtual GBool needNonText() { return gFalse; }
  virtual GBool needCharCount() { return gFalse; }

  // Page bracketing: the page size is remembered between the two calls.
  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
  virtual void endPage();

  // Per-character callback where the text is actually collected.
  virtual void drawChar(GfxState *state,
                        double xx, double yy,
                        double dx, double dy,
                        double ox, double oy,
                        CharCode c, int nBytes,
                        Unicode *u, int uLen);

  // Marked content bracketing, used to find the wanted MCID.
  virtual void beginMarkedContent(char *name, Dict *properties);
  virtual void endMarkedContent(GfxState *state);

  // Spans collected so far.
  const TextSpanArray& getTextSpans() const;

private:
  // Stores the pending text (if any) as a new span.
  void endSpan();

  // True while at least one tracked marked content sequence is open.
  bool inMarkedContent() const { return !mcidStack.empty(); }

  // Whether drawing with the given font requires starting a new span.
  bool needFontChange(GfxFont* font) const;

  GfxFont *currentFont;        // font of the span being built, or NULL
  GooString *currentText;      // text of the span being built, or NULL
  GfxRGB currentColor;         // color of the span being built
  TextSpanArray textSpans;     // finished spans
  int mcid;                    // MCID to capture
  std::vector<int> mcidStack;  // open sequences since capture started
  double pageWidth;
  double pageHeight;
  UnicodeMap *unicodeMap;      // output encoding, fetched on first use
};
#endif /* !MARKEDCONTENTOUTPUTDEV_H */
......@@ -14,6 +14,8 @@
#include "StructElement.h"
#include "StructTreeRoot.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "PDFDoc.h"
#include "Dict.h"
......@@ -981,6 +983,54 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
return NULL;
}
// Appends the text of this element to the given string, creating the
// string when NULL is passed, and returns it.
//
// A content element that is not an object reference contributes the text
// of all spans captured for its MCID. Any other element contributes the
// text of its children, visited in order, but only when "recursive" is
// set; otherwise NULL is returned.
GooString* StructElement::appendSubTreeText(GooString *string, GBool recursive) const
{
  const bool isMcidLeaf = isContent() && !isObjectRef();

  if (isMcidLeaf) {
    MarkedContentOutputDev mcdev(getMCID());
    const TextSpanArray& collected(getTextSpansInternal(mcdev));

    if (string == NULL)
      string = new GooString();

    TextSpanArray::const_iterator span = collected.begin();
    while (span != collected.end()) {
      string->append(span->getText());
      ++span;
    }
    return string;
  }

  if (!recursive)
    return NULL;

  if (string == NULL)
    string = new GooString();

  // Depth-first walk over the children yields the text in logical order.
  unsigned child = 0;
  while (child < getNumElements()) {
    getElement(child)->appendSubTreeText(string, recursive);
    child++;
  }

  return string;
}
// Runs the document's pages through the given output device and returns
// the spans it collected. When the element's page reference resolves to a
// page, only that page is processed; otherwise every page of the document
// is. The returned reference points into mcdev.
const TextSpanArray& StructElement::getTextSpansInternal(MarkedContentOutputDev& mcdev) const
{
  assert(isContent());

  // A page number of 0 means "not found".
  int page = 0;
  Ref ref;
  if (getPageRef(ref)) {
    page = treeRoot->getDoc()->findPage(ref.num, ref.gen);
  }

  int firstPage;
  int lastPage;
  if (page) {
    firstPage = page;
    lastPage = page;
  } else {
    firstPage = 1;
    lastPage = treeRoot->getDoc()->getNumPages();
  }

  treeRoot->getDoc()->displayPages(&mcdev, firstPage, lastPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
  return mcdev.getTextSpans();
}
static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
{
// Circular reference
......
......@@ -17,6 +17,7 @@
#include "goo/gtypes.h"
#include "goo/GooString.h"
#include "MarkedContentOutputDev.h"
#include "Object.h"
#include <vector>
#include <set>
......@@ -218,9 +219,36 @@ public:
// ActualText of the element; content elements have none, so NULL is
// returned for them.
const GooString *getActualText() const {
  if (isContent())
    return NULL;
  return s->actualText;
}
// Non-const variant: NULL for content elements, the stored ActualText
// string otherwise.
GooString *getActualText() {
  if (isContent())
    return NULL;
  return s->actualText;
}
// Text content referenced by the element.
//
// - MCID reference elements yield the text of their marked content
//   sequence in the page stream; the "recursive" flag has no effect
//   for them.
// - Other elements, with "recursive" set, yield the concatenated text
//   of *all* MCID reference elements found in their subtree, gathered
//   by visiting the leaves in logical order.
// - Everything else yields NULL.
//
// The result is a newly allocated string owned by the caller.
//
GooString *getText(GBool recursive = gTrue) const {
  GooString *collected = appendSubTreeText(NULL, recursive);
  return collected;
}
// Text spans of a content element, returned as a copy. Elements that are
// not content elements yield an empty array.
const TextSpanArray getTextSpans() const {
  if (isContent()) {
    // The device only lives for this call; the spans are copied out of
    // it by the by-value return.
    MarkedContentOutputDev mcdev(getMCID());
    return getTextSpansInternal(mcdev);
  }
  return TextSpanArray();
}
~StructElement();
private:
// Appends the element's text to "string" (allocating it when NULL) and
// returns it; returns NULL for non-leaf elements when "recursive" is unset.
GooString* appendSubTreeText(GooString *string, GBool recursive) const;
// Renders the relevant page(s) into "mcdev" and returns a reference to the
// spans stored inside it, so the result must not outlive "mcdev".
const TextSpanArray& getTextSpansInternal(MarkedContentOutputDev& mcdev) const;
typedef std::vector<Attribute*> AttrPtrArray;
typedef std::vector<StructElement*> ElemPtrArray;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment