Commit 57c371e3 authored by Thomas Haller's avatar Thomas Haller

shared: add nm_utils_buf_utf8safe_escape() util

We already have nm_utils_str_utf8safe_escape() to convert a
NUL terminated string to a UTF-8 string. nm_utils_str_utf8safe_escape()
operates under the assumption that the input string is already valid UTF-8
and returns the input string verbatim. That way, in the common expected
cases, the string just looks like a regular UTF-8 string.
However, in case there are invalid UTF-8 sequences (or a backslash
escape characters), the function will use backslash escaping to encode
the input string as a valid UTF-8 sequence. Note that the escaped
sequence, can be reverted to the original non-UTF-8 string via
unescape.
Examples where this is useful are file names or interface names,
which are not in a defined encoding, but are NUL terminated and commonly
ASCII or UTF-8 encoded.

Extend this, to also handle not NUL terminated buffers. The same
applies, except that the process cannot be reverted via g_strcompress()
-- because the NUL character cannot be unescaped.

This will be useful to escape a Wi-Fi SSID. Commonly we expect the SSID
to be in UTF-8/ASCII encoding and we want to print it verbatim. Only
if that is not the case, we fall back to backslash escaping. However, the
original value can be fully recovered via unescape(). The difference
between an SSID and a filename is that the former can contain '\0'
bytes.
parent ced0dd2e
......@@ -6559,74 +6559,161 @@ test_nm_utils_enum (void)
/*****************************************************************************/
static void
do_test_utils_str_utf8safe (const char *str, const char *expected, NMUtilsStrUtf8SafeFlags flags)
_do_test_utils_str_utf8safe_unescape (const char *str, const char *expected, gsize expected_len)
{
const char *str_safe, *s;
gs_free char *str2 = NULL;
gs_free char *str3 = NULL;
gsize l;
const char *s;
gs_free gpointer buf_free_1 = NULL;
gs_free char *str_free_1 = NULL;
str_safe = nm_utils_str_utf8safe_escape (str, flags, &str2);
s = nm_utils_buf_utf8safe_unescape (str, &l, &buf_free_1);
g_assert_cmpint (expected_len, ==, l);
g_assert_cmpstr (s, ==, expected);
str3 = nm_utils_str_utf8safe_escape_cp (str, flags);
g_assert_cmpstr (str3, ==, str_safe);
g_assert ((!str && !str3) || (str != str3));
g_clear_pointer (&str3, g_free);
if (str == NULL) {
g_assert (!s);
g_assert (!buf_free_1);
g_assert_cmpint (l, ==, 0);
} else {
g_assert (s);
if (!strchr (str, '\\')) {
g_assert (!buf_free_1);
g_assert (s == str);
g_assert_cmpint (l, ==, strlen (str));
} else {
g_assert (buf_free_1);
g_assert (s == buf_free_1);
g_assert (memcmp (s, expected, expected_len) == 0);
}
}
if ( expected
&& l == strlen (expected)) {
/* there are no embeeded NULs. Check that nm_utils_str_utf8safe_unescape() yields the same result. */
s = nm_utils_str_utf8safe_unescape (str, &str_free_1);
g_assert_cmpstr (s, ==, expected);
if (strchr (str, '\\')) {
g_assert (str_free_1 != str);
g_assert (s == str_free_1);
} else
g_assert (s == str);
}
}
/* Convenience wrapper around _do_test_utils_str_utf8safe_unescape().
 * The ""str"" concatenation only compiles when @str is a string literal.
 * The expected length is taken from the @expected literal via NM_STRLEN()
 * (defined elsewhere; presumably a compile-time length that also counts
 * embedded NUL bytes -- TODO confirm). */
#define do_test_utils_str_utf8safe_unescape(str, expected) \
_do_test_utils_str_utf8safe_unescape (""str"", expected, NM_STRLEN (expected))
/*
 * Test helper: escape the @str_len bytes at @str with both
 * nm_utils_buf_utf8safe_escape() and nm_utils_str_utf8safe_escape() and
 * verify the results against @expected, including a round trip through
 * the unescape functions.
 *
 * @str: the input buffer (may be NULL if @str_len is 0).
 * @str_len: the number of bytes in @str; may cover embedded NUL bytes.
 * @expected: the expected escaped string, or NULL if the input is expected
 *   to pass through unchanged (no escaping necessary).
 * @flags: escaping flags passed on to the functions under test.
 */
static void
_do_test_utils_str_utf8safe (const char *str, gsize str_len, const char *expected, NMUtilsStrUtf8SafeFlags flags)
{
	const char *str_safe;
	const char *buf_safe;
	const char *s;
	gs_free char *str_free_1 = NULL;
	gs_free char *str_free_2 = NULL;
	gs_free char *str_free_3 = NULL;
	gs_free char *str_free_4 = NULL;
	gs_free char *str_free_5 = NULL;
	gs_free char *str_free_6 = NULL;
	gs_free char *str_free_7 = NULL;
	gs_free char *str_free_8 = NULL;
	gboolean str_has_nul = FALSE;

	buf_safe = nm_utils_buf_utf8safe_escape (str, str_len, flags, &str_free_1);

	str_safe = nm_utils_str_utf8safe_escape (str, flags, &str_free_2);

	if (str_len == 0) {
		/* an empty buffer yields NULL from the buf variant, while the
		 * str variant returns its (NULL or empty) input. */
		g_assert (buf_safe == NULL);
		g_assert (str_free_1 == NULL);
		g_assert (str_safe == str);
		g_assert (str == NULL || str[0] == '\0');
		g_assert (str_free_2 == NULL);
	} else if (str_len == strlen (str)) {
		g_assert (buf_safe);
		g_assert_cmpstr (buf_safe, ==, str_safe);

		/* nm_utils_buf_utf8safe_escape() can only return a pointer equal to the input string,
		 * if and only if str_len is negative. Otherwise, the input str won't be NUL terminated
		 * and cannot be returned. */
		g_assert (buf_safe != str);
		g_assert (buf_safe == str_free_1);
	} else
		str_has_nul = TRUE;

	str_free_3 = nm_utils_str_utf8safe_escape_cp (str, flags);
	g_assert_cmpstr (str_free_3, ==, str_safe);
	g_assert ((!str && !str_free_3) || (str != str_free_3));

	/* the escaped buffer must unescape to the original bytes. */
	if (str_len > 0)
		_do_test_utils_str_utf8safe_unescape (buf_safe, str, str_len);

	if (expected == NULL) {
		/* no escaping expected: the str variant returns the input unchanged. */
		g_assert (!str_has_nul);

		g_assert (str_safe == str);
		g_assert (!str_free_2);
		if (str) {
			g_assert (!strchr (str, '\\'));
			g_assert (g_utf8_validate (str, -1, NULL));
		}

		g_assert (str == nm_utils_str_utf8safe_unescape (str_safe, &str_free_4));
		g_assert (!str_free_4);

		str_free_5 = nm_utils_str_utf8safe_unescape_cp (str_safe);
		if (str) {
			g_assert (str_free_5 != str);
			g_assert_cmpstr (str_free_5, ==, str);
		} else
			g_assert (!str_free_5);
		return;
	}

	if (!str_has_nul) {
		g_assert (str);
		g_assert (str_safe != str);
		g_assert (str_safe == str_free_2);
		/* escaping happened, so there must have been a reason for it. */
		g_assert (   strchr (str, '\\')
		          || !g_utf8_validate (str, -1, NULL)
		          || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII)
		              && NM_STRCHAR_ANY (str, ch, (guchar) ch >= 127))
		          || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL)
		              && NM_STRCHAR_ANY (str, ch, (guchar) ch < ' ')));
		g_assert (g_utf8_validate (str_safe, -1, NULL));

		/* without embedded NULs, all unescape variants must restore @str. */
		str_free_6 = g_strcompress (str_safe);
		g_assert_cmpstr (str, ==, str_free_6);

		str_free_7 = nm_utils_str_utf8safe_unescape_cp (str_safe);
		g_assert (str_free_7 != str);
		g_assert_cmpstr (str_free_7, ==, str);

		s = nm_utils_str_utf8safe_unescape (str_safe, &str_free_8);
		g_assert (str_free_8 != str);
		g_assert (s == str_free_8);
		g_assert_cmpstr (str_free_8, ==, str);

		g_assert_cmpstr (str_safe, ==, expected);

		return;
	}

	/* with embedded NULs only the buf variant sees the whole input, so
	 * only its result can be compared with @expected. */
	g_assert_cmpstr (buf_safe, ==, expected);
}
/* Convenience wrapper around _do_test_utils_str_utf8safe() that derives the
 * buffer length from the @str literal via NM_STRLEN() (defined elsewhere;
 * presumably a compile-time length that also counts embedded NUL bytes --
 * TODO confirm). The ""str"" concatenation only compiles when @str is a
 * string literal, so a NULL input has to call _do_test_utils_str_utf8safe()
 * directly. */
#define do_test_utils_str_utf8safe(str, expected, flags) \
_do_test_utils_str_utf8safe (""str"", NM_STRLEN (str), expected, flags)
static void
test_utils_str_utf8safe (void)
{
do_test_utils_str_utf8safe (NULL, NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
_do_test_utils_str_utf8safe (NULL, 0, NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\\", "\\\\", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\\a", "\\\\a", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314", "\\314", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314\315x\315\315x", "\\314\\315x\\315\\315x", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314\315xx", "\\314\\315xx", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
......@@ -6648,6 +6735,18 @@ test_utils_str_utf8safe (void)
do_test_utils_str_utf8safe ("㈞abä㈞b", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("abäb", "ab\\303\\244b", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII);
do_test_utils_str_utf8safe ("ab\ab", "ab\\007b", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL);
do_test_utils_str_utf8safe ("\0", "\\000", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\0a\0", "\\000a\\000", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\\\0", "\\\\\\000", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\n\0", "\n\\000", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\n\0", "\\012\\000", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL);
do_test_utils_str_utf8safe_unescape ("\n\\0", "\n\0");
do_test_utils_str_utf8safe_unescape ("\n\\01", "\n\01");
do_test_utils_str_utf8safe_unescape ("\n\\012", "\n\012");
do_test_utils_str_utf8safe_unescape ("\n\\.", "\n.");
do_test_utils_str_utf8safe_unescape ("\\n\\.3\\r", "\n.3\r");
}
/*****************************************************************************/
......
......@@ -1077,20 +1077,112 @@ _str_append_escape (GString *s, char ch)
g_string_append_c (s, '0' + ( ((guchar) ch) & 07));
}
/**
 * nm_utils_buf_utf8safe_unescape:
 * @str: (allow-none): the NUL terminated, backslash escaped input string.
 * @out_len: (out): the number of bytes in the returned buffer. The buffer
 *   may contain NUL bytes, so strlen() on the result is not reliable.
 * @to_free: (out): set to the newly allocated buffer if unescaping
 *   required a copy, otherwise set to %NULL.
 *
 * Reverts backslash escaping: "\\NNN" with up to three octal digits becomes
 * the byte with that value, "\\b", "\\f", "\\n", "\\r", "\\t" and "\\v"
 * become the respective control characters, and for any other escaped
 * character the backslash is dropped and the character is taken literally
 * (this covers "\\\\"). A trailing single backslash is dropped.
 *
 * Returns: %NULL if @str is %NULL. If @str contains no backslash, @str
 *   itself is returned and @to_free is %NULL. Otherwise the unescaped
 *   buffer, which is also returned via @to_free.
 */
gconstpointer
nm_utils_buf_utf8safe_unescape (const char *str, gsize *out_len, gpointer *to_free)
{
	GString *gstr;
	gsize len;
	const char *s;

	g_return_val_if_fail (to_free, NULL);
	g_return_val_if_fail (out_len, NULL);

	if (!str) {
		*out_len = 0;
		*to_free = NULL;
		return NULL;
	}

	len = strlen (str);

	s = memchr (str, '\\', len);
	if (!s) {
		/* nothing to unescape: hand back the input without copying. */
		*out_len = len;
		*to_free = NULL;
		return str;
	}

	/* unescaping never grows the data, so @len is an upper bound. */
	gstr = g_string_new_len (NULL, len);

	g_string_append_len (gstr, str, s - str);
	str = s;

	for (;;) {
		char ch;
		guint v;

		nm_assert (str[0] == '\\');

		ch = (++str)[0];

		if (ch == '\0') {
			/* error. Trailing '\\'. The lone backslash is dropped. */
			break;
		}

		/* Only '0'..'7' start an octal escape. '8' and '9' are not octal
		 * digits and fall through to the literal handling below, like
		 * g_strcompress() does. */
		if (ch >= '0' && ch <= '7') {
			v = ch - '0';
			ch = (++str)[0];
			if (ch >= '0' && ch <= '7') {
				v = v * 8 + (ch - '0');
				ch = (++str)[0];
				if (ch >= '0' && ch <= '7') {
					v = v * 8 + (ch - '0');
					str++;
				}
			}
			/* values above 0377 do not fit a byte and are truncated
			 * by the conversion to char. */
			ch = v;
		} else {
			switch (ch) {
			case 'b': ch = '\b'; break;
			case 'f': ch = '\f'; break;
			case 'n': ch = '\n'; break;
			case 'r': ch = '\r'; break;
			case 't': ch = '\t'; break;
			case 'v': ch = '\v'; break;
			default:
				/* Here we handle "\\\\", but all other unexpected escape sequences are really a bug.
				 * Take them literally, after removing the escape character */
				break;
			}
			str++;
		}

		g_string_append_c (gstr, ch);

		/* copy the plain run up to the next escape sequence (or the end). */
		s = strchr (str, '\\');
		if (!s) {
			g_string_append (gstr, str);
			break;
		}

		g_string_append_len (gstr, str, s - str);
		str = s;
	}

	*out_len = gstr->len;
	*to_free = gstr->str;
	return g_string_free (gstr, FALSE);
}
/**
* nm_utils_str_utf8safe_escape:
* @str: NUL terminated input string, possibly in utf-8 encoding
* nm_utils_buf_utf8safe_escape:
* @buf: byte array, possibly in utf-8 encoding, may have NUL characters.
* @buflen: the length of @buf in bytes, or -1 if @buf is a NUL terminated
* string.
* @flags: #NMUtilsStrUtf8SafeFlags flags
* @to_free: (out): return the pointer location of the string
* if a copying was necessary.
*
* Returns the possible non-UTF-8 NUL terminated string @str
* and uses backslash escaping (C escaping, like g_strescape())
* to sanitize non UTF-8 characters. The result is valid
* Based on the assumption, that @buf contains UTF-8 encoded bytes,
* this will return valid UTF-8 sequence, and invalid sequences
* will be escaped with backslash (C escaping, like g_strescape()).
* This is sanitize non UTF-8 characters. The result is valid
* UTF-8.
*
* The operation can be reverted with g_strcompress() or
* nm_utils_str_utf8safe_unescape().
* The operation can be reverted with nm_utils_buf_utf8safe_unescape().
* Note that if, and only if @buf contains no NUL bytes, the operation
* can also be reverted with g_strcompress().
*
* Depending on @flags, valid UTF-8 characters are not escaped at all
* (except the escape character '\\'). This is the difference to g_strescape(),
......@@ -1099,61 +1191,105 @@ _str_append_escape (GString *s, char ch)
* as UTF-8 -- with exception of the backslash escape character,
* invalid UTF-8 sequences, and other (depending on @flags).
*
* Returns: the escaped input string, as valid UTF-8. If no escaping
* is necessary, it returns the input @str. Otherwise, an allocated
* Returns: the escaped input buffer, as valid UTF-8. If no escaping
* is necessary, it returns the input @buf. Otherwise, an allocated
* string @to_free is returned which must be freed by the caller
* with g_free. The escaping can be reverted by g_strcompress().
**/
const char *
nm_utils_str_utf8safe_escape (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free)
nm_utils_buf_utf8safe_escape (gconstpointer buf, gssize buflen, NMUtilsStrUtf8SafeFlags flags, char **to_free)
{
const char *const str = buf;
const char *p = NULL;
GString *s;
const char *s;
gboolean nul_terminated = FALSE;
GString *gstr;
g_return_val_if_fail (to_free, NULL);
*to_free = NULL;
if (!str || !str[0])
return str;
if ( g_utf8_validate (str, -1, &p)
&& !NM_STRCHAR_ANY (str, ch,
( ch == '\\' \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL) \
&& ch < ' ') \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII) \
&& ((guchar) ch) >= 127))))
return str;
if (buflen == 0)
return NULL;
s = g_string_sized_new ((p - str) + strlen (p) + 5);
if (buflen < 0) {
if (!str)
return NULL;
buflen = strlen (str);
if (buflen == 0)
return str;
nul_terminated = TRUE;
}
if ( g_utf8_validate (str, buflen, &p)
&& nul_terminated) {
/* note that g_utf8_validate() does not allow NUL character inside @str. Good.
* We can treat @str like a NUL terminated string. */
if (!NM_STRCHAR_ANY (str, ch,
( ch == '\\' \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL) \
&& ch < ' ') \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII) \
&& ((guchar) ch) >= 127))))
return str;
}
gstr = g_string_sized_new (buflen + 5);
s = str;
do {
for (; str < p; str++) {
char ch = str[0];
buflen -= p - s;
nm_assert (buflen >= 0);
for (; s < p; s++) {
char ch = s[0];
if (ch == '\\')
g_string_append (s, "\\\\");
g_string_append (gstr, "\\\\");
else if ( ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL) \
&& ch < ' ') \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII) \
&& ((guchar) ch) >= 127))
_str_append_escape (s, ch);
_str_append_escape (gstr, ch);
else
g_string_append_c (s, ch);
g_string_append_c (gstr, ch);
}
if (p[0] == '\0')
if (buflen <= 0)
break;
_str_append_escape (s, p[0]);
str = &p[1];
g_utf8_validate (str, -1, &p);
_str_append_escape (gstr, p[0]);
buflen--;
if (buflen == 0)
break;
s = &p[1];
g_utf8_validate (s, buflen, &p);
} while (TRUE);
*to_free = g_string_free (s, FALSE);
*to_free = g_string_free (gstr, FALSE);
return *to_free;
}
/*
 * Like nm_utils_buf_utf8safe_escape(), but takes the buffer from @bytes.
 * A NULL @bytes is passed on as a NULL buffer of length zero.
 */
const char *
nm_utils_buf_utf8safe_escape_bytes (GBytes *bytes, NMUtilsStrUtf8SafeFlags flags, char **to_free)
{
	gconstpointer data = NULL;
	gsize data_len = 0;

	if (bytes)
		data = g_bytes_get_data (bytes, &data_len);

	return nm_utils_buf_utf8safe_escape (data, data_len, flags, to_free);
}
/*****************************************************************************/
const char *
nm_utils_str_utf8safe_unescape (const char *str, char **to_free)
{
......@@ -1166,6 +1302,39 @@ nm_utils_str_utf8safe_unescape (const char *str, char **to_free)
return (*to_free = g_strcompress (str));
}
/**
 * nm_utils_str_utf8safe_escape:
 * @str: NUL terminated input string, possibly in utf-8 encoding
 * @flags: #NMUtilsStrUtf8SafeFlags flags
 * @to_free: (out): receives the allocated string, if a copy
 *   had to be made.
 *
 * Takes the NUL terminated string @str, which may or may not be
 * valid UTF-8, and sanitizes it with backslash escaping (C escaping,
 * like g_strescape()), so that the result is valid UTF-8.
 *
 * The escaping can be undone with g_strcompress() or
 * nm_utils_str_utf8safe_unescape().
 *
 * Unlike g_strescape(), which escapes all non-ASCII characters,
 * valid UTF-8 characters are -- depending on @flags -- left alone
 * (except the escape character '\\'). Hence valid UTF-8 is passed
 * on as-is and can be shown to the user directly, with exception of
 * the backslash escape character, invalid UTF-8 sequences, and
 * whatever else @flags requests.
 *
 * This is nm_utils_buf_utf8safe_escape() with a buffer length of -1.
 *
 * Returns: the escaped input string, as valid UTF-8. If no escaping
 *   is necessary, the input @str is returned. Otherwise an allocated
 *   string is returned, also via @to_free, which the caller must free
 *   with g_free.
 **/
const char *
nm_utils_str_utf8safe_escape (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free)
{
	/* a negative length marks @str as a NUL terminated string. */
	const gssize len_nul_terminated = -1;

	return nm_utils_buf_utf8safe_escape (str, len_nul_terminated, flags, to_free);
}
/**
* nm_utils_str_utf8safe_escape_cp:
* @str: NUL terminated input string, possibly in utf-8 encoding
......
......@@ -472,6 +472,10 @@ typedef enum {
NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII = 0x0002,
} NMUtilsStrUtf8SafeFlags;
/* Escape a byte buffer (which may contain NUL bytes) to valid UTF-8 using
 * backslash escaping. A negative @buflen means @buf is NUL terminated. */
const char *nm_utils_buf_utf8safe_escape (gconstpointer buf, gssize buflen, NMUtilsStrUtf8SafeFlags flags, char **to_free);
/* Same as nm_utils_buf_utf8safe_escape(), with the buffer taken from @bytes
 * (which may be NULL). */
const char *nm_utils_buf_utf8safe_escape_bytes (GBytes *bytes, NMUtilsStrUtf8SafeFlags flags, char **to_free);
/* Revert the escaping. The result may contain NUL bytes; its length is
 * returned in @out_len. */
gconstpointer nm_utils_buf_utf8safe_unescape (const char *str, gsize *out_len, gpointer *to_free);
/* Variants for NUL terminated strings. */
const char *nm_utils_str_utf8safe_escape (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free);
const char *nm_utils_str_utf8safe_unescape (const char *str, char **to_free);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment