Raised This Month: $85 Target: $400
 21% 

Unicode escape sequences to utf-8 strings converter


Post New Thread Reply   
 
Thread Tools Display Modes
Author Message
butare
Senior Member
Join Date: Nov 2016
Old 08-25-2020 , 18:29   Unicode escape sequences to utf-8 strings converter
Reply With Quote #1

Sharing my Unicode escape sequence converter. I was in need of a SourcePawn (sp) implementation, so here it is:
Code:
// Fallback code point substituted when an escape sequence cannot be decoded
// into a usable value. 65533 == 0xFFFD (U+FFFD REPLACEMENT CHARACTER).
#define INVALID_CODEPOINT 65533

/**
 * Converts "\uXXXX" escape sequences found in text into UTF-8 bytes.
 *
 * Any other backslash escape ("\x" where x is not 'u') is stored as the bare
 * character x. A UTF-16 surrogate pair written as two consecutive escapes
 * ("\uD83D\uDE00") is combined into one code point. An unpaired or reversed
 * surrogate is replaced by INVALID_CODEPOINT.
 *
 * The output is always NUL-terminated when size > 0. Conversion stops early,
 * never splitting a multi-byte sequence, once the next character would not
 * fit together with the terminator.
 *
 * @param text  Input string that may contain escape sequences.
 * @param buff  Destination buffer for the UTF-8 result.
 * @param size  Capacity of buff in cells, including room for the terminator.
 */
void EscapeUnicodeSequences(const char[] text, char[] buff, int size)
{
	if(size <= 0)
		return;
	
	int leading, trailing, codepoint;
	bool valid_unicode;
	int j = 0;
	
	// Stop one cell short of the capacity so the terminator always fits.
	for(int i = 0; text[i] && j < size - 1; i++)
	{
		// Handles escaped characters and saves them literally
		if(text[i] == '\\' && text[i + 1] != 'u')
		{
			// A backslash that ends the input escapes nothing: keep it and stop,
			// otherwise the index would be advanced past the input terminator.
			if(text[i + 1] == '\0')
			{
				buff[j++] = '\\';
				break;
			}
			
			buff[j++] = text[i + 1];
			i++;
			continue;
		}
		
		leading = GetUnicodeCodepoint(text[i], valid_unicode);
		
		if(!valid_unicode)
		{
			buff[j++] = text[i];
			continue;
		}
		
		// Skip five of the six characters of "\uXXXX"; the loop's i++ consumes the last one.
		i += 5;
		
		if(0xD800 <= leading <= 0xDBFF)
		{
			// High surrogate: only meaningful when a low surrogate escape follows directly.
			trailing = GetUnicodeCodepoint(text[i + 1], valid_unicode);
			
			if(valid_unicode && 0xDC00 <= trailing <= 0xDFFF)
			{
				i += 6;
				
				// http://unicode.org/faq/utf_bom.html
				codepoint = (leading << 10) + trailing + (0x10000 - (0xD800 << 10) - 0xDC00);
			}
			else
				codepoint = INVALID_CODEPOINT; // the following text is re-examined on the next iteration
		}
		else if(0xDC00 <= leading <= 0xDFFF)
			codepoint = INVALID_CODEPOINT; // low surrogate without a preceding high surrogate
		else
			codepoint = leading;
		
		if(codepoint < 0 || codepoint > 0x10FFFF)
			codepoint = INVALID_CODEPOINT;
		
		// Each branch first checks that the encoded bytes plus the terminator still fit.
		if(0 <= codepoint <= 0x007F)
		{
			if(j + 1 >= size)
				break;
			
			buff[j++] = codepoint;
		}
		else if(0x0080 <= codepoint <= 0x07FF)
		{
			if(j + 2 >= size)
				break;
			
			buff[j++] = 0xC0 | (codepoint >> 6);
			buff[j++] = 0x80 | (codepoint & 0x3F);
		}
		else if(0x0800 <= codepoint <= 0xFFFF)
		{
			if(j + 3 >= size)
				break;
			
			buff[j++] = 0xE0 | (codepoint >> 12);
			buff[j++] = 0x80 | ((codepoint >> 6) & 0x3F);
			buff[j++] = 0x80 | (codepoint & 0x3F);
		}
		else if(0x10000 <= codepoint <= 0x10FFFF)
		{
			if(j + 4 >= size)
				break;
			
			buff[j++] = 0xF0 | (codepoint >> 18);
			buff[j++] = 0x80 | ((codepoint >> 12) & 0x3F);
			buff[j++] = 0x80 | ((codepoint >> 6) & 0x3F);
			buff[j++] = 0x80 | (codepoint & 0x3F);
		}
	}
	
	buff[j] = '\0';
}

/**
 * Reads one "\uXXXX" escape located at the very start of buff.
 *
 * @param buff    Text to inspect; only its leading characters are examined.
 * @param parsed  Receives true when buff starts with "\u" and the hex parser
 *                accepted what follows, false otherwise.
 * @return        Value produced by ParseUnicodeHex, or 0 when the "\u" prefix is absent.
 */
int GetUnicodeCodepoint(const char[] buff, bool &parsed)
{
	parsed = false;
	
	bool has_prefix = (buff[0] == '\\' && buff[1] == 'u');
	if(has_prefix)
	{
		// Hand the text right after the two-character prefix to the hex parser.
		return ParseUnicodeHex(buff[2], parsed);
	}
	
	return 0;
}

/**
 * Parses exactly four hexadecimal digits from the start of buff.
 *
 * @param buff    Text positioned at the first hex digit; upper and lower case
 *                letters are both accepted.
 * @param parsed  Receives true only when all four characters are hex digits,
 *                false otherwise.
 * @return        The value of the four digits, or 0 when parsing fails.
 */
int ParseUnicodeHex(const char[] buff, bool &parsed)
{
	parsed = false;
	
	int result = 0;
	int digit;
	
	// Scan left to right: a string shorter than four characters is rejected at
	// its terminator instead of being indexed beyond it.
	for(int i = 0; i < 4; i++)
	{
		if('0' <= buff[i] <= '9')
			digit = buff[i] - '0';
		else if('a' <= buff[i] <= 'f')
			digit = buff[i] - 'a' + 10;
		else if('A' <= buff[i] <= 'F')
			digit = buff[i] - 'A' + 10;
		else
			return 0; // not a hex digit; this also covers the '\0' terminator
		
		result = (result << 4) | digit;
	}
	
	parsed = true;
	return result;
}
Basic usage example:
Code:
char buff[32];
EscapeUnicodeSequences("\u2B50", buff, sizeof(buff));
// Pass the converted text as an argument, not as the format string, so a '%'
// in the text is never interpreted as a format specifier.
PrintToChatAll("%s", buff); //Will print ⭐ in game chat

Last edited by butare; 08-25-2020 at 18:38.
butare is offline
Reply


Thread Tools
Display Modes

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is On
HTML code is Off

Forum Jump


All times are GMT -4. The time now is 13:22.


Powered by vBulletin®
Copyright ©2000 - 2020, vBulletin Solutions, Inc.
Theme made by Freecode