Commit df6d27b3 authored by Thomas Haller's avatar Thomas Haller

shared: add nm_utils_str_utf8safe_*() API to sanitize UTF-8 strings

Use C-style backslash escaping to sanitize non-UTF-8 strings.
The functions are compatible with glib's g_strcompress() and
g_strescape().

The difference is only that g_strescape() escapes all non-printable,
non ASCII character as well, while nm_utils_str_utf8safe_escape()
-- depending on the flags -- preserves valid UTF-8 sequence except
backslash.

The flags allow to optionally escape ASCII control characters and
all non-ASCII (valid UTF-8) characters. But the option to preserve
valid UTF-8 (non-ASCII) characters verbatim, is what distinguishes
from g_strescape().
parent e216d5ea
......@@ -5323,6 +5323,100 @@ static void test_nm_utils_enum (void)
/*****************************************************************************/
static void
do_test_utils_str_utf8safe (const char *str, const char *expected, NMUtilsStrUtf8SafeFlags flags)
{
const char *str_safe, *s;
gs_free char *str2 = NULL;
gs_free char *str3 = NULL;
str_safe = nm_utils_str_utf8safe_escape (str, flags, &str2);
str3 = nm_utils_str_utf8safe_escape_cp (str, flags);
g_assert_cmpstr (str3, ==, str_safe);
g_assert ((!str && !str3) || (str != str3));
g_clear_pointer (&str3, g_free);
if (expected == NULL) {
g_assert (str_safe == str);
g_assert (!str2);
if (str) {
g_assert (!strchr (str, '\\'));
g_assert (g_utf8_validate (str, -1, NULL));
}
g_assert (str == nm_utils_str_utf8safe_unescape (str_safe, &str3));
g_assert (!str3);
str3 = nm_utils_str_utf8safe_unescape_cp (str_safe);
if (str) {
g_assert (str3 != str);
g_assert_cmpstr (str3, ==, str);
} else
g_assert (!str3);
g_clear_pointer (&str3, g_free);
return;
}
g_assert (str);
g_assert (str_safe != str);
g_assert (str_safe == str2);
g_assert ( strchr (str, '\\')
|| !g_utf8_validate (str, -1, NULL)
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII)
&& NM_STRCHAR_ANY (str, ch, (guchar) ch >= 127))
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL)
&& NM_STRCHAR_ANY (str, ch, (guchar) ch < ' ')));
g_assert (g_utf8_validate (str_safe, -1, NULL));
str3 = g_strcompress (str_safe);
g_assert_cmpstr (str, ==, str3);
g_clear_pointer (&str3, g_free);
str3 = nm_utils_str_utf8safe_unescape_cp (str_safe);
g_assert (str3 != str);
g_assert_cmpstr (str3, ==, str);
g_clear_pointer (&str3, g_free);
s = nm_utils_str_utf8safe_unescape (str_safe, &str3);
g_assert (str3 != str);
g_assert (s == str3);
g_assert_cmpstr (str3, ==, str);
g_clear_pointer (&str3, g_free);
g_assert_cmpstr (str_safe, ==, expected);
}
static void
test_utils_str_utf8safe (void)
{
do_test_utils_str_utf8safe (NULL, NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314", "\\314", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314\315x\315\315x", "\\314\\315x\\315\\315x", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314\315xx", "\\314\\315xx", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\314xx", "\\314xx", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\xa0", "\\240", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\xe2\x91\xa0", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\xe2\xe2\x91\xa0", "\\342\xe2\x91\xa0", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("\xe2\xe2\x91\xa0\xa0", "\\342\xe2\x91\xa0\\240", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("a", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("ab", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("ab\314", "ab\\314", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("ab\314adsf", "ab\\314adsf", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("abadsf", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("abäb", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("x\xa0", "x\\240", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe (\304ab\\äb", \\304ab\\\\äb", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("Äab\\äb", "Äab\\\\äb", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("ÄÄab\\äb", "ÄÄab\\\\äb", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("㈞abä㈞b", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE);
do_test_utils_str_utf8safe ("abäb", "ab\\303\\244b", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII);
do_test_utils_str_utf8safe ("ab\ab", "ab\\007b", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL);
}
/*****************************************************************************/
static int
_test_nm_in_set_get (int *call_counter, gboolean allow_called, int value)
{
......@@ -5680,6 +5774,7 @@ int main (int argc, char **argv)
nmtst_init (&argc, &argv, TRUE);
/* The tests */
g_test_add_func ("/core/general/test_utils_str_utf8safe", test_utils_str_utf8safe);
g_test_add_func ("/core/general/test_nm_in_set", test_nm_in_set);
g_test_add_func ("/core/general/test_nm_in_strset", test_nm_in_strset);
g_test_add_func ("/core/general/test_setting_vpn_items", test_setting_vpn_items);
......
......@@ -397,3 +397,141 @@ nm_g_object_set_property (GObject *object,
}
/*****************************************************************************/
static void
_str_append_escape (GString *s, char ch)
{
g_string_append_c (s, '\\');
g_string_append_c (s, '0' + ((((guchar) ch) >> 6) & 07));
g_string_append_c (s, '0' + ((((guchar) ch) >> 3) & 07));
g_string_append_c (s, '0' + ( ((guchar) ch) & 07));
}
/**
* nm_utils_str_utf8safe_escape:
* @str: NUL terminated input string, possibly in utf-8 encoding
* @flags: #NMUtilsStrUtf8SafeFlags flags
* @to_free: (out): return the pointer location of the string
* if a copying was necessary.
*
* Returns the possible non-UTF-8 NUL terminated string @str
* and uses backslash escaping (C escaping, like g_strescape())
* to sanitize non UTF-8 characters. The result is valid
* UTF-8.
*
* The operation can be reverted with g_strcompress() or
* nm_utils_str_utf8safe_unescape().
*
* Depending on @flags, valid UTF-8 characters are not escaped at all
* (except the escape character '\\'). This is the difference to g_strescape(),
* which escapes all non-ASCII characters. This allows to pass on
* valid UTF-8 characters as-is and can be directly shown to the user
* as UTF-8 -- with exception of the backslash escape character,
* invalid UTF-8 sequences, and other (depending on @flags).
*
* Returns: the escaped input string, as valid UTF-8. If no escaping
* is necessary, it returns the input @str. Otherwise, an allocated
* string @to_free is returned which must be freed by the caller
* with g_free. The escaping can be reverted by g_strcompress().
**/
const char *
nm_utils_str_utf8safe_escape (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free)
{
const char *p = NULL;
GString *s;
g_return_val_if_fail (to_free, NULL);
*to_free = NULL;
if (!str || !str[0])
return str;
if ( g_utf8_validate (str, -1, &p)
&& !NM_STRCHAR_ANY (str, ch,
( ch == '\\' \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL) \
&& ch < ' ') \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII) \
&& ((guchar) ch) >= 127))))
return str;
s = g_string_sized_new ((p - str) + strlen (p) + 5);
do {
for (; str < p; str++) {
char ch = str[0];
if (ch == '\\')
g_string_append (s, "\\\\");
else if ( ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL) \
&& ch < ' ') \
|| ( NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII) \
&& ((guchar) ch) >= 127))
_str_append_escape (s, ch);
else
g_string_append_c (s, ch);
}
if (p[0] == '\0')
break;
_str_append_escape (s, p[0]);
str = &p[1];
g_utf8_validate (str, -1, &p);
} while (TRUE);
*to_free = g_string_free (s, FALSE);
return *to_free;
}
const char *
nm_utils_str_utf8safe_unescape (const char *str, char **to_free)
{
g_return_val_if_fail (to_free, NULL);
if (!str || !strchr (str, '\\')) {
*to_free = NULL;
return str;
}
return (*to_free = g_strcompress (str));
}
/**
* nm_utils_str_utf8safe_escape_cp:
* @str: NUL terminated input string, possibly in utf-8 encoding
* @flags: #NMUtilsStrUtf8SafeFlags flags
*
* Like nm_utils_str_utf8safe_escape(), except the returned value
* is always a copy of the input and must be freed by the caller.
*
* Returns: the escaped input string in UTF-8 encoding. The returned
* value should be freed with g_free().
* The escaping can be reverted by g_strcompress().
**/
char *
nm_utils_str_utf8safe_escape_cp (const char *str, NMUtilsStrUtf8SafeFlags flags)
{
char *s;
nm_utils_str_utf8safe_escape (str, flags, &s);
return s ?: g_strdup (str);
}
char *
nm_utils_str_utf8safe_unescape_cp (const char *str)
{
return str ? g_strcompress (str) : NULL;
}
char *
nm_utils_str_utf8safe_escape_take (char *str, NMUtilsStrUtf8SafeFlags flags)
{
char *str_to_free;
nm_utils_str_utf8safe_escape (str, flags, &str_to_free);
if (str_to_free) {
g_free (str);
return str_to_free;
}
return str;
}
......@@ -97,4 +97,20 @@ gboolean nm_g_object_set_property (GObject *object,
/*****************************************************************************/
typedef enum {
NM_UTILS_STR_UTF8_SAFE_FLAG_NONE = 0,
NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL = 0x0001,
NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII = 0x0002,
} NMUtilsStrUtf8SafeFlags;
const char *nm_utils_str_utf8safe_escape (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free);
const char *nm_utils_str_utf8safe_unescape (const char *str, char **to_free);
char *nm_utils_str_utf8safe_escape_cp (const char *str, NMUtilsStrUtf8SafeFlags flags);
char *nm_utils_str_utf8safe_unescape_cp (const char *str);
char *nm_utils_str_utf8safe_escape_take (char *str, NMUtilsStrUtf8SafeFlags flags);
/*****************************************************************************/
#endif /* __NM_SHARED_UTILS_H__ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment