Unicode escape sequences to UTF-8 strings converter

butare · **Author**

Sharing my Unicode escape sequence converter. I was in need of a SourcePawn implementation, so here it is:

Code:

// 65533 == 0xFFFD (U+FFFD REPLACEMENT CHARACTER); substituted whenever an escape
// sequence does not produce a usable codepoint.
#define INVALID_CODEPOINT 65533

/**
 * Converts "\uXXXX" escape sequences found in a string into UTF-8 encoded
 * characters. A backslash followed by any character other than 'u' emits that
 * character literally (so "\\" yields a single backslash). UTF-16 surrogate
 * pairs written as two consecutive escapes ("\uD83D\uDE00") are combined into
 * one codepoint; unpaired surrogates are replaced with INVALID_CODEPOINT.
 *
 * The output is always null-terminated when size > 0. Conversion stops as soon
 * as the next character would not fit completely, so a multi-byte character is
 * never written partially.
 *
 * @param text      Input string containing escape sequences.
 * @param buff      Destination buffer for the UTF-8 result.
 * @param size      Size of the destination buffer, including the terminator.
 */
void EscapeUnicodeSequences(const char[] text, char[] buff, int size)
{
	if(size <= 0)
		return;
	
	int leading, trailing, codepoint, needed;
	bool valid_unicode;
	int j = 0;
	
	// "j + 1 < size" always keeps one cell free for the terminator
	for(int i = 0; text[i] && j + 1 < size; i++)
	{
		// Handles escaped characters and saves them literally
		if(text[i] == '\\' && text[i + 1] != 'u')
		{
			// A lone backslash at the very end escapes nothing; stop here
			// instead of stepping over the input's terminator.
			if(text[i + 1] == '\0')
				break;
			
			buff[j++] = text[i + 1];
			i++;
			continue;
		}
		
		leading = GetUnicodeCodepoint(text[i], valid_unicode);
		
		if(!valid_unicode)
		{
			buff[j++] = text[i];
			continue;
		}
		
		// Move to the last hex digit of "\uXXXX"; the loop increment steps past it
		i += 5;
		
		if(leading >= 0xD800 && leading <= 0xDBFF)
		{
			// High surrogate: only valid when directly followed by a low surrogate
			trailing = GetUnicodeCodepoint(text[i + 1], valid_unicode);
			
			if(valid_unicode && trailing >= 0xDC00 && trailing <= 0xDFFF)
			{
				i += 6;
				
				// http://unicode.org/faq/utf_bom.html
				codepoint = (leading << 10) + trailing + (0x10000 - (0xD800 << 10) - 0xDC00);
			}
			else
			{
				// The following text (if any) is left for the next iteration
				codepoint = INVALID_CODEPOINT;
			}
		}
		else if(leading >= 0xDC00 && leading <= 0xDFFF)
		{
			// Low surrogate without a preceding high surrogate
			codepoint = INVALID_CODEPOINT;
		}
		else
		{
			codepoint = leading;
		}
		
		if(codepoint < 0 || codepoint > 0x10FFFF)
			codepoint = INVALID_CODEPOINT;
		
		// Number of UTF-8 bytes required for this codepoint
		if(codepoint <= 0x007F)
			needed = 1;
		else if(codepoint <= 0x07FF)
			needed = 2;
		else if(codepoint <= 0xFFFF)
			needed = 3;
		else
			needed = 4;
		
		// Not enough room for the whole character plus the terminator
		if(j + needed >= size)
			break;
		
		if(needed == 1)
		{
			buff[j++] = codepoint;
		}
		else if(needed == 2)
		{
			buff[j++] = 0xC0 | (codepoint >> 6);
			buff[j++] = 0x80 | (codepoint & 0x3F);
		}
		else if(needed == 3)
		{
			buff[j++] = 0xE0 | (codepoint >> 12);
			buff[j++] = 0x80 | ((codepoint >> 6) & 0x3F);
			buff[j++] = 0x80 | (codepoint & 0x3F);
		}
		else
		{
			buff[j++] = 0xF0 | (codepoint >> 18);
			buff[j++] = 0x80 | ((codepoint >> 12) & 0x3F);
			buff[j++] = 0x80 | ((codepoint >> 6) & 0x3F);
			buff[j++] = 0x80 | (codepoint & 0x3F);
		}
	}
	
	buff[j] = '\0';
}

/**
 * Reads one "\uXXXX" escape located at the very start of buff.
 *
 * @param buff      Text to inspect.
 * @param parsed    Set to true when a complete escape was read, false otherwise.
 * @return          Value of the four hex digits, or 0 when nothing was parsed.
 */
int GetUnicodeCodepoint(const char[] buff, bool &parsed)
{
	// Only a backslash immediately followed by 'u' introduces an escape;
	// the hex parser reports success or failure through 'parsed' itself.
	if(buff[0] == '\\' && buff[1] == 'u')
		return ParseUnicodeHex(buff[2], parsed);
	
	parsed = false;
	return 0;
}

/**
 * Parses exactly four hexadecimal digits (upper or lower case) from the start
 * of buff.
 *
 * The digits are read front to back, so a string shorter than four characters
 * is rejected at its terminator without reading any cell beyond it.
 *
 * @param buff      Text beginning with the hex digits.
 * @param parsed    Set to true when all four digits were valid, false otherwise.
 * @return          Parsed value (0x0000-0xFFFF), or 0 on failure.
 */
int ParseUnicodeHex(const char[] buff, bool &parsed)
{
	parsed = false;
	
	int result = 0;
	int digit;
	for(int i = 0; i < 4; i++)
	{
		if(buff[i] >= '0' && buff[i] <= '9')
			digit = buff[i] - '0';
		else if(buff[i] >= 'a' && buff[i] <= 'f')
			digit = buff[i] - 'a' + 10;
		else if(buff[i] >= 'A' && buff[i] <= 'F')
			digit = buff[i] - 'A' + 10;
		else
			return 0; // Not a hex digit; this also covers an early '\0'
		
		result = (result << 4) | digit;
	}
	
	parsed = true;
	return result;
}

Basic usage example:

Code:

char buff[32];
// The backslash is doubled so the compiler stores a literal "\u2B50" in the
// string; the converter then sees the escape sequence at runtime.
EscapeUnicodeSequences("\\u2B50", buff, sizeof(buff));
// Pass the result as an argument rather than as the format string, so any '%'
// in converted text is not interpreted as a format specifier.
PrintToChatAll("%s", buff); //Will print ⭐ in game chat