Javascript: Unicode string to hex

208,484

Solution 1

Remember that a JavaScript code unit is 16 bits wide. Therefore the hex string form will be 4 digits per code unit.

usage:

// NOTE: hexEncode() and hexDecode() are not built-in String methods; they are
// the String.prototype extensions defined in the two snippets that follow.
var str = "\u6f22\u5b57"; // "\u6f22\u5b57" === "漢字"
alert(str.hexEncode().hexDecode());

String to hex form:

/**
 * Encodes this string as hexadecimal, emitting exactly four zero-padded
 * lowercase hex digits for every UTF-16 code unit.
 *
 * @returns {string} The hex form of the string ("" for an empty string).
 */
String.prototype.hexEncode = function () {
    let encoded = "";
    for (let index = 0; index < this.length; index += 1) {
        const unitHex = this.charCodeAt(index).toString(16);
        // Fixed width keeps the code units separable when decoding.
        encoded += unitHex.padStart(4, "0");
    }
    return encoded;
};

Back again:

/**
 * Decodes a hex string back into text, treating each group of up to four
 * characters as the hex value of one UTF-16 code unit.
 *
 * @returns {string} The decoded string ("" when nothing matches).
 */
String.prototype.hexDecode = function () {
    // match() returns null when there is nothing to match (e.g. an empty
    // string), so fall back to an empty list.
    const groups = this.match(/.{1,4}/g) || [];
    return groups
        .map(function (group) {
            return String.fromCharCode(parseInt(group, 16));
        })
        .join("");
};

Solution 2

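Here is a tweak of McDowell's algorithm that doesn't pad the result. Note that without padding the code units have varying widths, so the output is ambiguous and cannot be reliably decoded back into the original string: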

  /**
   * Concatenates the hex value of every UTF-16 code unit in `str`,
   * without padding (each unit contributes one to four hex digits).
   *
   * @param {string} str - Text to encode.
   * @returns {string} Unpadded lowercase hex digits.
   */
  function toHex(str) {
    const pieces = [];
    for (let index = 0; index < str.length; index += 1) {
      pieces.push(str.charCodeAt(index).toString(16));
    }
    return pieces.join('');
  }

Solution 3

It depends on what encoding you use. If you want to convert utf-8 encoded hex to string, use this:

/**
 * Decodes a string of UTF-8 bytes written as hex into text.
 *
 * Every pair of characters is turned into a "%XX" escape and the result is
 * handed to decodeURIComponent. If decoding throws, the failure is logged
 * and the input is returned unchanged.
 *
 * @param {string} hex - Hex digits, two per UTF-8 byte.
 * @param {string} [str] - Unused; present only as part of the signature.
 * @returns {string} The decoded text, or `hex` itself when decoding fails.
 */
function fromHex(hex, str) {
  try {
    const percentEncoded = hex.replace(/(..)/g, '%$1');
    return decodeURIComponent(percentEncoded);
  } catch (e) {
    console.log('invalid hex input: ' + hex);
    return hex;
  }
}

For the other direction use this:

/**
 * Encodes text as the lowercase hex form of its UTF-8 bytes, always using
 * two digits per byte so the output can be split back into bytes.
 *
 * If encoding throws (encodeURIComponent rejects lone surrogates), the
 * failure is logged and the input is returned unchanged.
 *
 * @param {string} str - Text to encode.
 * @param {string} [hex] - Ignored on input; kept so the signature is unchanged.
 * @returns {string} Two hex digits per UTF-8 byte, or `str` itself on failure.
 */
function toHex(str, hex) {
  try {
    // encodeURIComponent yields the UTF-8 encoding: bytes that need escaping
    // appear as "%XX" (uppercase hex), all others as literal ASCII characters.
    hex = encodeURIComponent(str).replace(
      /%([0-9A-F]{2})|[^%]/g,
      function (match, escapedByte) {
        if (escapedByte !== undefined) {
          return escapedByte.toLowerCase();
        }
        // Zero-pad so every byte occupies exactly two digits.
        return match.charCodeAt(0).toString(16).padStart(2, '0');
      }
    );
  } catch (e) {
    hex = str;
    console.log('invalid text input: ' + str);
  }
  return hex;
}

Solution 4

A more up to date solution, for encoding:

// This helper is shared by all of the encoders below; in most
// cases you probably won't need it except for debugging.
/**
 * Formats a sequence of byte values as lowercase hex, two digits per byte.
 *
 * @param {Iterable<number>|ArrayLike<number>} bytes - Byte values to format.
 * @returns {string} The concatenated hex digits.
 */
function bytesToHex(bytes) {
  let hex = "";
  for (const byte of Array.from(bytes)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}

// You almost certainly want UTF-8, which is
// now natively supported via TextEncoder.
/**
 * Encodes a string as UTF-8.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} The UTF-8 bytes.
 */
function stringToUTF8Bytes(string) {
  const utf8Encoder = new TextEncoder();
  return utf8Encoder.encode(string);
}

// But you might want UTF-16 for some reason.
// .charCodeAt(index) will return the underlying
// UTF-16 code-units (not code-points!), so you
// just need to format them in whichever endian order you want.
/**
 * Encodes a string as UTF-16, two bytes per code unit.
 *
 * @param {string} string - Text to encode.
 * @param {boolean} [littleEndian] - true for little-endian output;
 *   false or omitted for big-endian.
 * @returns {Uint8Array} The UTF-16 bytes (length is `string.length * 2`).
 */
function stringToUTF16Bytes(string, littleEndian) {
  const bytes = new Uint8Array(string.length * 2);
  // DataView lets the byte order be chosen explicitly per write.
  const view = new DataView(bytes.buffer);
  for (let i = 0; i < string.length; i++) {
    // setUint16 takes a BYTE offset, and each code unit occupies two bytes.
    view.setUint16(i * 2, string.charCodeAt(i), littleEndian);
  }
  return bytes;
}

// And you might want UTF-32 in even weirder cases.
// Fortunately, iterating a string gives the code
// points, which are identical to the UTF-32 encoding,
// though you still have the endianness issue.
/**
 * Encodes a string as UTF-32, four bytes per code point.
 *
 * @param {string} string - Text to encode.
 * @param {boolean} [littleEndian] - true for little-endian output;
 *   false or omitted for big-endian.
 * @returns {Uint8Array} The UTF-32 bytes (four per code point).
 */
function stringToUTF32Bytes(string, littleEndian) {
  const codepoints = Array.from(string, c => c.codePointAt(0));
  const bytes = new Uint8Array(codepoints.length * 4);
  // DataView lets the byte order be chosen explicitly per write.
  const view = new DataView(bytes.buffer);
  for (let i = 0; i < codepoints.length; i++) {
    // setUint32 takes a BYTE offset, and each code point occupies four bytes.
    view.setUint32(i * 4, codepoints[i], littleEndian);
  }
  return bytes;
}

Examples:

// For the UTF-16 and UTF-32 encoders the second argument is the
// littleEndian flag: false = big-endian, true = little-endian.
bytesToHex(stringToUTF8Bytes("hello 漢字 👍"))
// "68656c6c6f20e6bca2e5ad9720f09f918d"
bytesToHex(stringToUTF16Bytes("hello 漢字 👍", false))
// "00680065006c006c006f00206f225b570020d83ddc4d"
bytesToHex(stringToUTF16Bytes("hello 漢字 👍", true))
// "680065006c006c006f002000226f575b20003dd84ddc"
bytesToHex(stringToUTF32Bytes("hello 漢字 👍", false))
// "00000068000000650000006c0000006c0000006f0000002000006f2200005b57000000200001f44d"
bytesToHex(stringToUTF32Bytes("hello 漢字 👍", true))
// "68000000650000006c0000006c0000006f00000020000000226f0000575b0000200000004df40100"

Decoding is generally a lot simpler; you just need:

/**
 * Parses a hex string into bytes, two hex digits per byte.
 *
 * A trailing unpaired digit is ignored, and a pair that does not parse as
 * hex is stored as 0 (NaN coerces to 0 in a Uint8Array).
 *
 * @param {string} hex - Hex digits to parse.
 * @returns {Uint8Array} The parsed bytes.
 */
function hexToBytes(hex) {
    const byteCount = hex.length / 2;
    const out = new Uint8Array(byteCount);
    for (let index = 0, offset = 0; index < out.length; index += 1, offset += 2) {
        out[index] = Number.parseInt(hex.slice(offset, offset + 2), 16);
    }
    return out;
}

then use the encoding parameter of TextDecoder:

// UTF-8 is the default when TextDecoder is constructed without a label
new TextDecoder().decode(hexToBytes("68656c6c6f20e6bca2e5ad9720f09f918d"));
// but you can also pass an explicit encoding label:
new TextDecoder("UTF-16LE").decode(hexToBytes("680065006c006c006f002000226f575b20003dd84ddc"))
new TextDecoder("UTF-16BE").decode(hexToBytes("00680065006c006c006f00206f225b570020d83ddc4d"));
// Each of the three calls above returns "hello 漢字 👍"

Here's the list of allowed encoding names: https://www.w3.org/TR/encoding/#names-and-labels

You might notice UTF-32 is not on that list, which is a pain, so:

/**
 * Decodes UTF-32 bytes into a string.
 *
 * @param {Uint8Array} bytes - UTF-32 data, four bytes per code point. Any
 *   trailing bytes that do not form a complete code point are ignored.
 * @param {boolean} [littleEndian] - true if the data is little-endian;
 *   false or omitted for big-endian.
 * @returns {string} The decoded text.
 * @throws {RangeError} If a decoded value is not a valid code point
 *   (thrown by String.fromCodePoint).
 */
function bytesToStringUTF32(bytes, littleEndian) {
  // Respect the view's own window: `bytes` may be a subarray (or a pooled
  // Buffer) that covers only part of its underlying ArrayBuffer.
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const codepoints = new Uint32Array(Math.floor(view.byteLength / 4));
  for (let i = 0; i < codepoints.length; i++) {
    codepoints[i] = view.getUint32(i * 4, littleEndian);
  }
  // Convert in chunks: spreading a very large array into a single call can
  // exceed the engine's argument-count limit.
  const chunkSize = 8192;
  let text = "";
  for (let start = 0; start < codepoints.length; start += chunkSize) {
    text += String.fromCodePoint(...codepoints.subarray(start, start + chunkSize));
  }
  return text;
}

Then:

// The second argument is the littleEndian flag: false reads the bytes as
// big-endian, true as little-endian.
bytesToStringUTF32(hexToBytes("00000068000000650000006c0000006c0000006f0000002000006f2200005b57000000200001f44d"), false)
bytesToStringUTF32(hexToBytes("68000000650000006c0000006c0000006f00000020000000226f0000575b0000200000004df40100"), true)
// Both calls return "hello 漢字 👍"

Solution 5

how do you get "\u6f22\u5b57" from 漢字 in JavaScript?

These are JavaScript Unicode escape sequences e.g. \u12AB. To convert them, you could iterate over every code unit in the string, call .toString(16) on it, and go from there.

However, it is more efficient to also use hexadecimal escape sequences e.g. \xAA in the output wherever possible.

Also note that ASCII symbols such as A, b, and - probably don’t need to be escaped.

I’ve written a small JavaScript library that does all this for you, called jsesc. It has lots of options to control the output.

Here’s an online demo of the tool in action: http://mothereff.in/js-escapes#1%E6%BC%A2%E5%AD%97


Your question was tagged as utf-8. Reading the rest of your question, UTF-8 encoding/decoding didn’t seem to be what you wanted here, but in case you ever need it: use utf8.js (online demo).

Share:
208,484
Admin
Author by

Admin

Updated on October 03, 2021

Comments

  • Admin
    Admin over 2 years

    I'm trying to convert a unicode string to a hexadecimal representation in javascript.

    This is what I have:

    /**
     * Decodes a hex string two digits at a time, producing one character
     * per pair (so each character code is at most 0xFF).
     *
     * @param {*} hex - Hex digits; converted with toString() first.
     * @returns {string} The decoded characters.
     */
    function convertFromHex(hex) {
        const digits = hex.toString(); // force conversion
        let decoded = '';
        let offset = 0;
        while (offset < digits.length) {
            const pair = digits.slice(offset, offset + 2);
            decoded += String.fromCharCode(parseInt(pair, 16));
            offset += 2;
        }
        return decoded;
    }
    
    /**
     * Concatenates the unpadded hex value of every UTF-16 code unit in
     * `str` (one to four hex digits per unit).
     *
     * @param {string} str - Text to encode.
     * @returns {string} The concatenated hex digits.
     */
    function convertToHex(str) {
        const parts = [];
        let position = 0;
        while (position < str.length) {
            parts.push(str.charCodeAt(position).toString(16));
            position += 1;
        }
        return parts.join('');
    }
    

    But it fails on Unicode characters, such as Chinese:

    Input: 漢字

    Output: ªo"[W

    Any ideas? Can this be done in javascript?

  • Admin
    Admin about 10 years
    Thanks, just 1 question though (may be a dumb one..) -- how do you get \u6f22\u5b57 from 漢字 in javascript? Closest is with the escape() function but this uses % - I guess a regex of sorts could be used to replace % with / - but the escape() function is also deprecated. EncodeURI and encodeURIComponent both give a different output. Any idea?
  • McDowell
    McDowell about 10 years
    "\u6f22\u5b57" is the Unicode escape form of the literal "漢字" in the same way that \n is the newline character. I tend to use them to avoid ambiguity and avoid character encoding issues. See the specification for details. To generate them yourself change the above ("000"+hex).slice(-4) to "\\u" + ("000"+hex).slice(-4). The expression "\u6f22\u5b57" === "漢字" evaluates to true because after code parsing they are the same.
  • Admin
    Admin about 10 years
    Thanks, 1 issue I'm running into, sometimes hex.match(//.{1,4}/g); does not match anything. (error: null is not an object (evaluating hexes.length)) - do you know what could be the cause?
  • McDowell
    McDowell about 10 years
    Must be an empty string. You can switch to using the substr method you used before (using width 4 instead) or use var hexes = result.match(/.{1,4}/g) || [];
  • Admin
    Admin about 10 years
    Thanks for your fast reply! Actually seems to be an issue with ascii. I have a message "test", which was converted in hex to "74657374". With your method to convert back, I get "瑥獴" instead of "test". (I do get back the original input with the convertFromHex method in my question) Any ideas?
  • McDowell
    McDowell about 10 years
    If you were using the top algorithm as written "test" encodes to "0074006500730074". There is no ASCII. JavaScript strings are always UTF-16.
  • Admin
    Admin about 10 years
    if only using simple ascii, is it not possible to use my original method? Is it in any way possible to detect -> input == ascii only -> use original method, otherwise use your method. I'm guessing that's the easy part, but the decoding may be harder to detect which method was used...
  • McDowell
    McDowell about 10 years
  • Inverse
    Inverse almost 9 years
    Why would you not want to pad? now the hex output is ambiguous
  • martian17
    martian17 about 6 years
    I fixed the hexDecode function since it didn't seem to work; var a = "\\x73\\x75\\x62\\x73\\x74\\x72"; var str = "\\u6f22\\u5b57"; String.prototype.hexDecode = function(){ var j; var hexes = this.split("\\"); var back = ""; for(j = 1; j<hexes.length; j++) { var xhex = hexes[j]; var hex = xhex.slice(1); back += String.fromCharCode(parseInt(hex, 16)); } return back; }; a.hexDecode(); //"substr" str.hexDecode(); //"漢字" this also works for Hexadecimal escape sequences
  • redgeoff
    redgeoff about 6 years
    e.g. if you needed to hex encode a string or something similar
  • kyw
    kyw about 5 years
    Not sure what I'm looking at, but this is useful for me to get user's private CouchDB database! Thanks
  • Munawwar
    Munawwar about 4 years
    for the toHex function, if hex < 10, it needs '0' padding.. if \n or \t appears in the text, it would appear as '9' or 'a'.. but it should be '09' and '0a' respectively.
  • Munawwar
    Munawwar about 4 years
    you can change it to return v.charCodeAt(0).toString(16).padStart(2, '0')
  • JMerinoH
    JMerinoH almost 4 years
    Kudos @redgeoff! this solution works when passing the string into PHP and decoding with hex2bin().
  • Boris Verkhovskiy
    Boris Verkhovskiy about 3 years
    putting the usage of some functions you're going to define later is confusing, I assumed JS had these natively.
  • KMA Badshah
    KMA Badshah over 2 years
    It's important to remember that the result will be in big-endian format.
  • Jon R
    Jon R about 2 years
    Thank you! Two hours of searching and all I needed was this: function stringToUTF8Bytes(string) { return new TextEncoder().encode(string); }