Commit 54f799e6 authored by Dan Shea, committed by Albert Astals Cid

Add pdftotext -nodiag flag to remove diagonal text on output

parent d706a9ae
......@@ -177,6 +177,10 @@
#define combMaxMidDelta 0.3
#define combMaxBaseDelta 0.4
// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
// (Or abs(1/tan(angle)) for text closer to the 90/270 degree axes.)
#define diagonalThreshold 0.1
namespace {
inline bool isAscii7 (Unicode uchar) {
......@@ -2357,11 +2361,12 @@ TextWord *TextWordList::get(int idx) {
// TextPage
//------------------------------------------------------------------------
TextPage::TextPage(bool rawOrderA) {
TextPage::TextPage(bool rawOrderA, bool discardDiagA) {
int rot;
refCnt = 1;
rawOrder = rawOrderA;
discardDiag = discardDiagA;
curWord = nullptr;
charPos = 0;
curFont = nullptr;
......@@ -2384,6 +2389,7 @@ TextPage::TextPage(bool rawOrderA) {
underlines = new std::vector<TextUnderline*>();
links = new std::vector<TextLink*>();
mergeCombining = true;
diagonal = false;
}
TextPage::~TextPage() {
......@@ -2470,6 +2476,7 @@ void TextPage::clear() {
}
delete links;
diagonal = false;
curWord = nullptr;
charPos = 0;
curFont = nullptr;
......@@ -2592,6 +2599,11 @@ void TextPage::beginWord(GfxState *state) {
} else {
rot = (m[2] > 0) ? 1 : 3;
}
if (fabs(m[0]) >= fabs(m[1])) {
diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
} else {
diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
}
// for vertical writing mode, the lines are effectively rotated 90
// degrees
......@@ -2720,6 +2732,12 @@ void TextPage::addChar(GfxState *state, double x, double y,
beginWord(state);
}
// throw away diagonal chars
if (discardDiag && diagonal) {
charPos += nBytes;
return;
}
// page rotation and/or transform matrices can cause text to be
// drawn in reverse order -- in this case, swap the begin/end
// coordinates and break text into individual chars
......@@ -2729,6 +2747,13 @@ void TextPage::addChar(GfxState *state, double x, double y,
(curWord->rot == 3 && h1 > 0)) {
endWord();
beginWord(state);
// throw away diagonal chars
if (discardDiag && diagonal) {
charPos += nBytes;
return;
}
x1 += w1;
y1 += h1;
w1 = -w1;
......@@ -5648,11 +5673,12 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA,
double fixedPitchA, bool rawOrderA,
bool append) {
bool append, bool discardDiagA) {
text = nullptr;
physLayout = physLayoutA;
fixedPitch = physLayout ? fixedPitchA : 0;
rawOrder = rawOrderA;
discardDiag = discardDiagA;
doHTML = false;
ok = true;
......@@ -5679,21 +5705,22 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA,
}
// set up text object
text = new TextPage(rawOrderA);
text = new TextPage(rawOrderA, discardDiagA);
actualText = new ActualText(text);
}
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
bool physLayoutA, double fixedPitchA,
bool rawOrderA) {
bool rawOrderA, bool discardDiagA) {
outputFunc = func;
outputStream = stream;
needClose = false;
physLayout = physLayoutA;
fixedPitch = physLayout ? fixedPitchA : 0;
rawOrder = rawOrderA;
discardDiag = discardDiagA;
doHTML = false;
text = new TextPage(rawOrderA);
text = new TextPage(rawOrderA, discardDiagA);
actualText = new ActualText(text);
ok = true;
}
......@@ -5961,7 +5988,7 @@ TextPage *TextOutputDev::takeText() {
TextPage *ret;
ret = text;
text = new TextPage(rawOrder);
text = new TextPage(rawOrder, discardDiag);
return ret;
}
......
......@@ -553,7 +553,7 @@ class TextPage {
public:
// Constructor.
TextPage(bool rawOrderA);
TextPage(bool rawOrderA, bool discardDiagA = false);
TextPage(const TextPage &) = delete;
TextPage& operator=(const TextPage &) = delete;
......@@ -685,6 +685,7 @@ private:
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
bool rawOrder; // keep text in content stream order
bool discardDiag; // discard diagonal text
bool mergeCombining; // merge when combining and base characters
// are drawn on top of each other
......@@ -698,6 +699,7 @@ private:
int nTinyChars; // number of "tiny" chars seen so far
bool lastCharOverlap; // set if the last added char overlapped the
// previous char
bool diagonal; // whether the current text is diagonal
TextPool *pools[4]; // a "pool" of TextWords for each rotation
TextFlow *flows; // linked list of flows
......@@ -772,18 +774,20 @@ public:
// written (this is useful, e.g., for searching text). If
// <physLayoutA> is true, the original physical layout of the text
// is maintained. If <rawOrder> is true, the text is kept in
// content stream order.
// content stream order. If <discardDiag> is true, diagonal text
// is removed from output.
TextOutputDev(const char *fileName, bool physLayoutA,
double fixedPitchA, bool rawOrderA,
bool append);
bool append, bool discardDiagA = false);
// Create a TextOutputDev which will write to a generic stream. If
// <physLayoutA> is true, the original physical layout of the text
// is maintained. If <rawOrder> is true, the text is kept in
// content stream order.
// content stream order. If <discardDiag> is true, diagonal text
// is removed from output.
TextOutputDev(TextOutputFunc func, void *stream,
bool physLayoutA, double fixedPitchA,
bool rawOrderA);
bool rawOrderA, bool discardDiagA = false);
// Destructor.
~TextOutputDev();
......@@ -920,6 +924,9 @@ private:
// assume fixed-pitch characters with this
// width
bool rawOrder; // keep text in content stream order
bool discardDiag; // Diagonal text, i.e., text that is not close to one of the
// 0, 90, 180, or 270 degree axes, is discarded. This is useful
// to skip watermarks drawn on top of body text, etc.
bool doHTML; // extra processing for HTML conversion
bool ok; // set up ok?
......
......@@ -62,6 +62,11 @@ Keep the text in content stream order. This is a hack which often
"undoes" column formatting, etc. Use of raw mode is no longer
recommended.
.TP
.B \-nodiag
Discard diagonal text (i.e., text that is not close to one of the
0, 90, 180, or 270 degree axes). This is useful for skipping
watermarks drawn on top of body text.
.TP
.B \-htmlmeta
Generate a simple HTML file, including the meta information. This
simply wraps the text in <pre> and </pre> and prepends the meta
......
......@@ -83,6 +83,7 @@ static bool bboxLayout = false;
static bool physLayout = false;
static double fixedPitch = 0;
static bool rawOrder = false;
static bool discardDiag = false;
static bool htmlMeta = false;
static char textEncName[128] = "";
static char textEOL[16] = "";
......@@ -115,6 +116,8 @@ static const ArgDesc argDesc[] = {
"assume fixed-pitch (or tabular) text"},
{"-raw", argFlag, &rawOrder, 0,
"keep strings in content stream order"},
{"-nodiag", argFlag, &discardDiag, 0,
"discard diagonal text"},
{"-htmlmeta", argFlag, &htmlMeta, 0,
"generate a simple HTML file, including the meta information"},
{"-enc", argString, textEncName, sizeof(textEncName),
......@@ -363,7 +366,7 @@ int main(int argc, char *argv[]) {
// write text file
if (htmlMeta && bbox) { // htmlMeta && is superfluous but makes gcc happier
textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta);
textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
if (textOut->isOk()) {
if (bboxLayout) {
......@@ -378,7 +381,7 @@ int main(int argc, char *argv[]) {
}
} else {
textOut = new TextOutputDev(textFileName->c_str(),
physLayout, fixedPitch, rawOrder, htmlMeta);
physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag);
if (textOut->isOk()) {
if ((w==0) && (h==0) && (x==0) && (y==0)) {
doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment