UTF.h 3.24 KB
Newer Older
Kristian Høgsberg's avatar
Kristian Høgsberg committed
1 2
//========================================================================
//
3
// UTF.h
Kristian Høgsberg's avatar
Kristian Høgsberg committed
4
//
5
// This file is licensed under the GPLv2 or later
Kristian Høgsberg's avatar
Kristian Høgsberg committed
6
//
Albert Astals Cid's avatar
Albert Astals Cid committed
7
// Copyright (C) 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
Albert Astals Cid's avatar
0.48  
Albert Astals Cid committed
8
// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
Albert Astals Cid's avatar
Albert Astals Cid committed
9
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
10 11 12
//
//========================================================================

13 14 15 16 17 18 19
#ifndef UTF_H
#define UTF_H

// Legacy GCC interface/implementation pragma, only active when the build
// defines USE_GCC_PRAGMAS.
// NOTE(review): this is a header, and GCC documents "#pragma interface" for
// headers and "#pragma implementation" for the matching source file, so
// "implementation" here looks unintended — confirm before changing.
#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif

20 21 22
#include <cstdint>
#include <climits>

23
#include "goo/GooString.h"
24 25 26 27 28 29 30 31 32
#include "CharTypes.h"

// Convert a UTF-16 string to UCS-4
//   utf16      - UTF-16 input, passed as an array of Unicode elements
//   utf16_len  - number of UTF-16 characters in utf16
//                (presumably counted in code units — confirm)
//   ucs4_out   - if not NULL, allocates and returns the UCS-4 string. Free with gfree.
//   returns number of UCS-4 characters
int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);

33 34 35 36 37
// Convert a PDF Text String to UCS-4
//   textStr    - PDF text string
//   ucs4       - if the number of UCS-4 characters is > 0, allocates and
//                returns the UCS-4 string. Free with gfree.
//   returns number of UCS-4 characters
Albert Astals Cid's avatar
Albert Astals Cid committed
38
int TextStringToUCS4(const GooString *textStr, Unicode **ucs4);
39

40 41 42
// Check whether a UCS-4 character is valid
//   ucs4       - UCS-4 character to test
//   returns true if the character is valid
// NOTE(review): the exact validity criteria are defined by the implementation
// and are not visible in this header.
bool UnicodeIsValid(Unicode ucs4);

43 44
// Check whether a UCS-4 character is a Unicode whitespace character
//   ucs4       - UCS-4 character to test
//   returns true if the character is whitespace
// NOTE(review): the set of characters treated as whitespace is defined by the
// implementation and is not visible in this header.
bool UnicodeIsWhitespace(Unicode ucs4);
45

46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
// Count the number of UTF-16 code units required to convert a UTF-8 string
// (excluding the terminating NUL). Each invalid byte is counted as a
// code point since the UTF-8 conversion functions will replace it with
// REPLACEMENT_CHAR.
// NOTE(review): "code point" above presumably means one UTF-16 code unit per
// invalid byte, since this function counts code units — confirm.
//  utf8 - UTF-8 string to measure. No length argument is taken, so the string
//         presumably must be NUL terminated — confirm.
int utf8CountUtf16CodeUnits(const char *utf8);

// Convert UTF-8 to UTF-16
//  utf8 - UTF-8 string to convert. If not null terminated, set maxUtf8 to the
//         number of bytes to convert.
//  utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
//  maxUtf16 - maximum size of the output buffer, including space for the null
//             (presumably counted in UTF-16 code units — confirm).
//             Defaults to INT_MAX.
//  maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
//            either this count is reached or a null is encountered.
//            Defaults to INT_MAX.
// Returns the number of UTF-16 code units written (excluding the null).
// NOTE(review): with the default maxUtf16 this argument does not bound the
// output, so the caller presumably has to size utf16 beforehand (e.g. with
// utf8CountUtf16CodeUnits) — confirm.
int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16 = INT_MAX, int maxUtf8 = INT_MAX);

// Allocate a UTF-16 string and convert utf8 into it.
//  utf8 - UTF-8 string to convert.
//  len - optional output argument, defaults to nullptr. Presumably receives the
//        number of UTF-16 code units written when non-null — confirm.
// Returns the newly allocated UTF-16 string.
// NOTE(review): the deallocation function is not stated here; presumably gfree,
// as for the other allocating functions in this header — confirm.
uint16_t *utf8ToUtf16(const char *utf8, int *len = nullptr);

// Count the number of UTF-8 bytes required to convert a UTF-16 string to
// UTF-8 (excluding the terminating NUL).
//  utf16 - UTF-16 string to measure. No length argument is taken, so the
//          string presumably must be null terminated — confirm.
int utf16CountUtf8Bytes(const uint16_t *utf16);

// Convert UTF-16 to UTF-8
//  utf16 - UTF-16 string to convert. If not null terminated, set maxUtf16 to
//          the number of code units to convert.
//  utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
//  maxUtf8 - maximum size of the output buffer, including space for the null
//            (presumably counted in bytes — confirm). Defaults to INT_MAX.
//  maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops
//             when either this count is reached or a null is encountered.
//             Defaults to INT_MAX.
// Returns the number of UTF-8 bytes written (excluding the null).
// NOTE(review): with the default maxUtf8 this argument does not bound the
// output, so the caller presumably has to size utf8 beforehand (e.g. with
// utf16CountUtf8Bytes) — confirm.
int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX);

// Allocate a UTF-8 string and convert utf16 into it.
//  utf16 - UTF-16 string to convert.
//  len - optional output argument, defaults to nullptr. Presumably receives the
//        number of UTF-8 bytes written when non-null — confirm.
// Returns the newly allocated UTF-8 string.
// NOTE(review): the deallocation function is not stated here; presumably gfree,
// as for the other allocating functions in this header — confirm.
char *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr);

82
#endif