Unicode supplementary plane support in annotation

Currently, poppler/Annot.cc still assumes that every Unicode character (scalar value) takes exactly 2 bytes when encoded as UTF-16. (https://gitlab.freedesktop.org/poppler/poppler/-/blob/master/poppler/Annot.cc#L3042, https://gitlab.freedesktop.org/poppler/poppler/-/blob/master/poppler/Annot.cc#L3048-3049)

This is true for BMP (Basic Multilingual Plane) characters. However, some characters, such as emoji and some rare characters in natural languages, are not in the BMP and take 4 bytes (a surrogate pair) in UTF-16:

>>> # use Python console as an example
>>> "a".encode(encoding="utf-16")[2:]  # BOM stripped
b'a\x00'
>>> "😀".encode(encoding="utf-16")[2:]
b'=\xd8\x00\xde'
>>> "𰻝".encode(encoding="utf-16")[2:]
b'\x83\xd8\xdd\xde'

I have tried to add supplementary-plane handling inside the HorizontalTextLayouter constructor like this:

diff --git a/poppler/Annot.cc b/poppler/Annot.cc
index e8db39ff..8147d89f 100644
--- a/poppler/Annot.cc
+++ b/poppler/Annot.cc
@@ -3044,18 +3044,31 @@ public:
                 newFontNeeded = false;
             } else {
                 Unicode uChar;
+                int charLength;
                 if (isUnicode) {
                     uChar = (unsigned char)(text->getChar(i)) << 8;
                     uChar += (unsigned char)(text->getChar(i + 1));
+                    charLength = 2;
+                    // If uChar is in supplementary plane, we need to get the next character
+                    // because the font may not have the glyph for the first character.
+                    if (uChar >= 0xD800 && uChar <= 0xDBFF) {
+                        if (i + 3 < text->getLength()) {
+                            uChar = (uChar - 0xD800) * 0x400 + ((unsigned char)(text->getChar(i + 2)) << 8) + (unsigned char)(text->getChar(i + 3)) + 0x10000;
+                            charLength = 4;
+                            printf("uChar: %x\n", uChar);
+                        }
+                    }
                 } else {
                     uChar = pdfDocEncoding[text->getChar(i) & 0xff];
+                    charLength = 1;
                 }
                 const std::string auxFontName = form->getFallbackFontForChar(uChar, *font);
                 if (!auxFontName.empty()) {
+                    printf("auxFontName: %s\n", auxFontName.c_str());
                     std::shared_ptr<GfxFont> auxFont = form->getDefaultResources()->lookupFont(auxFontName.c_str());
 
                     // Here we just layout one char, we don't know if the one afterwards can be layouted with the original font
-                    GooString auxContents = GooString(text->toStr().substr(i, isUnicode ? 2 : 1));
+                    GooString auxContents = GooString(text->toStr().substr(i, charLength));
                     if (isUnicode) {
                         auxContents.prependUnicodeMarker();
                     }
@@ -3070,13 +3083,14 @@ public:
                     // we also need to allow the character if we have not layouted anything yet because otherwise we will end up in an infinite loop
                     // because it is assumed we at least layout one character
                     if (!availableWidth || *availableWidth > 0 || (isUnicode && i == 2) || (!isUnicode && i == 0)) {
-                        i += isUnicode ? 2 : 1;
+                        i += charLength;
                         data.emplace_back(outputText.toStr(), auxFontName, blockWidth, charCount);
                     }
                 } else {
+                    printf("auxFontName: not found\n");
                     error(errSyntaxError, -1, "HorizontalTextLayouter, couldn't find a font for character U+{0:04uX}", uChar);
                     newFontNeeded = false;
-                    i += isUnicode ? 2 : 1;
+                    i += charLength;
                 }
             }
             // Now layout the rest of the text with the original font

However, this does not work (I'm testing it with Okular): it cannot find a font to display the new uChar. I'm afraid that further investigation is a bit beyond my knowledge :(

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information