Commit 6f638621 authored by Adrian Johnson, committed by Albert Astals Cid
Browse files

Convert UTF-16 to UCS-4 when reading toUnicode cmap

to ensure only UCS-4 values are used with the "Unicode" type.
parent b3b0f5ab
......@@ -296,6 +296,7 @@ set(poppler_SRCS
poppler/strtok_r.cpp
poppler/UnicodeMap.cc
poppler/UnicodeTypeTable.cc
poppler/UTF.cc
poppler/XRef.cc
poppler/PSOutputDev.cc
poppler/TextOutputDev.cc
......@@ -466,7 +467,7 @@ if(ENABLE_XPDF_HEADERS)
poppler/SecurityHandler.h
poppler/StdinCachedFile.h
poppler/StdinPDFDocBuilder.h
poppler/UTF8.h
poppler/UTF.h
poppler/XpdfPluginAPI.h
poppler/Sound.h
${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
......
......@@ -61,7 +61,7 @@
#include "CairoOutputDev.h"
#include "CairoFontEngine.h"
#include "CairoRescaleBox.h"
#include "UTF8.h"
#include "UTF.h"
//------------------------------------------------------------------------
// #define LOG_CAIRO
......
......@@ -43,6 +43,7 @@
#include "GlobalParams.h"
#include "PSTokenizer.h"
#include "CharCodeToUnicode.h"
#include "UTF.h"
//------------------------------------------------------------------------
......@@ -453,15 +454,16 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
}
map[code] = 0;
sMap[sMapLen].c = code;
sMap[sMapLen].len = n / 4;
sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode));
for (j = 0; j < sMap[sMapLen].len; ++j) {
if (!parseHex(uStr + j*4, 4, &sMap[sMapLen].u[j])) {
int utf16Len = n / 4;
Unicode *utf16 = (Unicode*)gmallocn(utf16Len, sizeof(Unicode));
for (j = 0; j < utf16Len; ++j) {
if (!parseHex(uStr + j*4, 4, &utf16[j])) {
error(errSyntaxWarning, -1, "Illegal entry in ToUnicode CMap");
return;
}
}
sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
utf16[utf16Len - 1] += offset;
sMap[sMapLen].len = UTF16toUCS4(utf16, utf16Len, &sMap[sMapLen].u);
++sMapLen;
}
}
......
......@@ -108,7 +108,7 @@
#include "NameToUnicodeTable.h"
#include "UnicodeMapTables.h"
#include "UTF8.h"
#include "UTF.h"
#ifdef ENABLE_PLUGINS
# ifdef _WIN32
......
......@@ -251,7 +251,7 @@ poppler_include_HEADERS = \
PSOutputDev.h \
TextOutputDev.h \
SecurityHandler.h \
UTF8.h \
UTF.h \
XpdfPluginAPI.h \
Sound.h
nodist_poppler_include_HEADERS = poppler-config.h
......@@ -317,6 +317,7 @@ libpoppler_la_SOURCES = \
strtok_r.cpp \
UnicodeMap.cc \
UnicodeTypeTable.cc \
UTF.cc \
ViewerPreferences.cc \
XRef.cc \
PSOutputDev.cc \
......
......@@ -2392,24 +2392,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
w1 /= uLen;
h1 /= uLen;
for (i = 0; i < uLen; ++i) {
if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
/* next code is a low surrogate */
Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
i++;
curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
} else {
/* missing low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
}
} else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
/* invalid low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
} else {
curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
}
curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
}
}
charPos += nBytes;
......
#include "goo/gmem.h"
#include "UTF.h"
int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
{
int i, n, len;
Unicode *u;
// count characters
len = 0;
for (i = 0; i < utf16Len; i++) {
if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
i++; /* surrogate pair */
}
len++;
}
if (ucs4 == NULL)
return len;
u = (Unicode*)gmallocn(len, sizeof(Unicode));
n = 0;
// convert string
for (i = 0; i < utf16Len; i++) {
if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
/* next code is a low surrogate */
u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
++i;
} else {
/* missing low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
u[n] = 0xfffd;
}
} else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
/* invalid low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
u[n] = 0xfffd;
} else {
u[n] = utf16[i];
}
n++;
}
*ucs4 = u;
return len;
}
//========================================================================
//
// UTF8.h
// UTF.h
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
......@@ -20,6 +20,23 @@
//
//========================================================================
#ifndef UTF_H
#define UTF_H
#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif
#include "CharTypes.h"
// Convert a UTF-16 string to a UCS-4 string.
//   utf16     - array of UTF-16 code units (one code unit per element)
//   utf16_len - number of UTF-16 code units in utf16
//   ucs4_out  - if not NULL, allocates and returns the UCS-4 string.
//               Free with gfree.
// Returns the number of UCS-4 characters.
int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
static int mapUTF8(Unicode u, char *buf, int bufSize) {
if (u <= 0x0000007f) {
if (bufSize < 1) {
......@@ -60,7 +77,7 @@ static int mapUCS2(Unicode u, char *buf, int bufSize) {
if (u <= 0xffff) {
if (bufSize < 2) {
return 0;
}
}
buf[0] = (char)((u >> 8) & 0xff);
buf[1] = (char)(u & 0xff);
return 2;
......@@ -82,3 +99,5 @@ static int mapUCS2(Unicode u, char *buf, int bufSize) {
return 0;
}
}
#endif
......@@ -400,19 +400,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
h1 /= uLen;
}
for (i = 0; i < uLen; ++i) {
Unicode u1 = u[i];
if (u1 >= 0xd800 && u1 <= 0xdbff && i < uLen) {
// surrogate pair
const Unicode u2 = u[i + 1];
if (u2 >= 0xdc00 && u2 <= 0xdfff) {
u1 = 0x10000 + ((u1 - 0xd800) << 10) + (u2 - 0xdc00);
curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
}
++i;
} else {
curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
}
curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment