Convert from UTF-8 to Unicode in C++

16,098

Solution 1

I just wrote some code to do this yesterday...

I'm not saying this is the "perfect" way to do this, but it appears to work for all test cases I've run through it (I wrote both directions for that purpose).

I'll leave it to you to translate "%NN" to an integer value.

#include <iostream>
#include <deque>

// Encodes a single code point as a UTF-8 style byte sequence.
//
// charcode: the code point value to encode.
// Returns the encoded sequence, lead byte first, one byte value per element.
//
// Values below 128 are returned as a single element holding the value itself.
// No range validation is performed: values above 0x10FFFF are encoded with the
// same scheme and simply yield longer (five or six element) sequences.
std::deque<int> unicode_to_utf8(int charcode)
{
    std::deque<int> bytes;
    if (charcode < 0x80)
    {
        bytes.push_back(charcode);
        return bytes;
    }

    const int continuation_bits = 6;
    const int continuation_mask = (1 << continuation_bits) - 1;

    // Marker bits of the lead byte; gains one more leading 1 bit for every
    // continuation byte that has to be emitted.
    int lead_marker = 0xC0;

    // lead_capacity is the number of payload bits the lead byte could still
    // hold; as long as the remaining value does not fit, peel off the low six
    // bits into a continuation byte and shrink the lead byte's capacity.
    for (int lead_capacity = 6; charcode >= (1 << lead_capacity); --lead_capacity)
    {
        bytes.push_front(0x80 | (charcode & continuation_mask));
        charcode >>= continuation_bits;
        lead_marker |= 1 << lead_capacity;
    }

    // Whatever is left fits into the lead byte's payload.
    bytes.push_front(lead_marker | charcode);
    return bytes;
}


// Decodes one UTF-8 sequence from the front of `coded` and returns its code point.
//
// coded: byte values (expected range 0..255), lead byte first. The elements
//        that were consumed are removed, so the function can be called
//        repeatedly to walk through a longer byte stream.
//
// A first element below 128 is returned as is. Lead bytes announcing up to five
// continuation bytes are accepted, which covers everything unicode_to_utf8 can
// emit. Overlong encodings and surrogate code points are NOT rejected.
//
// Returns -1 if the input is malformed:
//   - `coded` is empty,
//   - the first byte is a continuation byte (10xxxxxx) or 0xFE / 0xFF,
//   - the sequence ends early or is followed by a byte that is not a
//     continuation byte (that offending byte is left in `coded`).
int utf8_to_unicode(std::deque<int> &coded)
{
    const int malformed = -1;
    const int other_bits = 6;
    const int other_mask = (1 << other_bits) - 1;

    if (coded.empty())
    {
        return malformed;
    }

    const int lead = coded.front();
    coded.pop_front();
    if (lead < 128)
    {
        return lead;
    }

    // The number of consecutive 1 bits below the top bit of the lead byte is
    // the number of continuation bytes that must follow.
    int continuation_count = 0;
    for (int probe = 0x40; probe != 0 && (lead & probe) != 0; probe >>= 1)
    {
        ++continuation_count;
    }

    // 0 means a stray continuation byte; 6 or 7 means 0xFE / 0xFF, which would
    // need more payload bits than fit in an int.
    if (continuation_count < 1 || continuation_count > 5)
    {
        return malformed;
    }

    // The lead byte contributes the payload bits below its marker bits.
    int charcode = lead & ((1 << (other_bits - continuation_count)) - 1);

    for (int i = 0; i < continuation_count; ++i)
    {
        // Never read past the end, and insist on the 10xxxxxx shape.
        if (coded.empty() || (coded.front() & 0xC0) != 0x80)
        {
            return malformed;
        }
        charcode = (charcode << other_bits) | (coded.front() & other_mask);
        coded.pop_front();
    }
    return charcode;
}

// Interactive round-trip check: repeatedly reads an integer code point from
// standard input, prints its encoding as "\x.." byte values, then decodes that
// sequence again and prints the result in decimal and in hex.
// The loop ends when no further integer can be read.
int main()
{
    int charcode; 

    for(;;)
    {
        std::cout << "Enter unicode value:" << std::endl;
        // Stop on end-of-input or a non-numeric token; without this check a
        // failed stream would make the loop spin forever.
        if (!(std::cin >> charcode))
        {
            break;
        }
        auto x = unicode_to_utf8(charcode);
        for(auto c : x)
        {
            std::cout << "\\x" << std::hex << c << " ";
        }
        std::cout << std::endl;
        // utf8_to_unicode consumes the elements of x while decoding.
        int c = utf8_to_unicode(x);
        std::cout << "reversed:" << std::dec << c << std::hex << " in hex:" << c << std::endl;
    }
    return 0;
}

Solution 2

This is actually in the standard library (note that std::wstring_convert and the <codecvt> header are deprecated since C++17):

#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale>  // for std::wstring_convert


// Converter between UTF-8 encoded std::string and UTF-32 std::u32string.
// NOTE: std::wstring_convert and std::codecvt_utf8 are deprecated since C++17.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;


// Minimal example: decodes the UTF-8 bytes of "ú" into its code point(s).
int main() {

    // "ú" (U+00FA) written as its two UTF-8 bytes, so the example does not
    // depend on the encoding of this source file or the compiler's execution
    // character set.
    std::string utf8_bytes = "\xC3\xBA";
    // from_bytes throws std::range_error if the input is not valid UTF-8.
    std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);

    return 0;
}

The other way around is done with conv_utf8_utf32.to_bytes.

Example with printing in your %hex format using printf:

#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale>  // for std::wstring_convert
#include <cstdio>


// Converter between UTF-8 encoded std::string and UTF-32 std::u32string.
// NOTE: std::wstring_convert and std::codecvt_utf8 are deprecated since C++17.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;


// Prints the UTF-8 bytes of "ú" and then its decoded code point(s), each value
// as a percent sign followed by upper-case hex digits.
int main() {

    // "ú" (U+00FA) written as its two UTF-8 bytes, so the example does not
    // depend on the encoding of this source file or the compiler's execution
    // character set.
    std::string utf8_bytes = "\xC3\xBA";
    // print the bytes in %hex format
    for (char byte: utf8_bytes) {
        // Go through unsigned char first so bytes >= 0x80 are not sign-extended;
        // %02X zero-pads so that e.g. 0x0A prints as %0A rather than "% A".
        printf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(byte)));
    }   
    printf("\n");


    // from_bytes throws std::range_error if the input is not valid UTF-8.
    std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);

    // print the code points in %hex format
    for (char32_t chr: unicode_codepoints) {
        // %X expects an unsigned int argument, so convert explicitly.
        printf("%%%02X", static_cast<unsigned int>(chr));
    }   
    printf("\n");


    return 0;
}
Share:
16,098
user2724841
Author by

user2724841

Updated on June 04, 2022

Comments

  • user2724841
    user2724841 almost 2 years

    How do I convert ú within a C++ application, where the application receives the character UTF-8 encoded as %C3%BA, and store it as the Unicode equivalent %FA? I just want to know how I would go about writing code to perform this conversion.