JavaScript strings outside of the BMP

12,353

Solution 1

Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.

But each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: the standard String members (length, split, slice, etc.) all deal with code units, not characters, so they will quite happily split surrogate pairs or hold invalid surrogate sequences.

If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:

// Returns the number of Unicode code points in this string.
// A well-formed surrogate pair (high unit followed by a low unit) takes two
// UTF-16 code units but is a single code point, so the number of pairs is
// subtracted from the code-unit length. A lone surrogate counts as one.
String.prototype.getCodePointLength= function() {
    var pairs= this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    var pairCount= pairs === null ? 0 : pairs.length;
    return this.length-pairCount;
};

// Builds a string from any number of Unicode code points.
// Values of 0x10000 and above are expanded into a high/low surrogate pair;
// everything else is handed to String.fromCharCode unchanged. No range
// validation is performed on the arguments.
String.fromCodePoint= function() {
    var units= [];
    for (var k= 0; k<arguments.length; k++) {
        var astral= arguments[k]-0x10000;
        if (astral>=0) {
            // Top 10 bits select the high surrogate, low 10 bits the low one.
            units.push(0xD800+(astral>>10), 0xDC00+(astral&0x3FF));
        } else {
            units.push(arguments[k]);
        }
    }
    return String.fromCharCode.apply(null, units);
};

Solution 2

I came to the same conclusion as bobince. If you want to work with strings containing Unicode characters outside of the BMP, you have to reimplement JavaScript's String methods. This is because JavaScript treats each 16-bit code unit as one character, while symbols outside of the BMP need two code units to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.

I've reimplemented the following methods to treat each Unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .slice, and .split.

You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/

Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.

if (!String.prototype.ucLength) {
    // Length of this string measured in Unicode code points.
    // Approach taken from
    // http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
    // Each well-formed surrogate pair is two code units but one code point,
    // so the pair count is removed from the code-unit length.
    String.prototype.ucLength = function() {
        var surrogatePairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
        if (surrogatePairs === null) {
            return this.length;
        }
        return this.length - surrogatePairs.length;
    };
}

if (!String.prototype.codePointAt) {
    // Returns the code point at code-point position ucPos, or null when the
    // position is never reached. A non-numeric ucPos is treated as 0.
    //
    // NOTE: the position here counts code points, whereas the ES2015 built-in
    // of the same name takes a UTF-16 code-unit index and yields undefined
    // when out of range. Because of the guard above, this version is only
    // installed on engines that have no built-in codePointAt.
    String.prototype.codePointAt = function (ucPos) {
        var wanted = isNaN(ucPos) ? 0 : ucPos;
        var text = String(this);
        var unit = 0;
        for (var seen = 0; unit < text.length; seen++) {
            var lead = text.charCodeAt(unit);
            var trail = text.charCodeAt(unit + 1); // NaN past the end
            var isPair = lead >= 0xD800 && lead <= 0xDBFF &&
                         trail >= 0xDC00 && trail <= 0xDFFF;
            if (seen == wanted) {
                if (!isPair) {
                    return lead;
                }
                // Recombine the two 10-bit halves and add the astral offset.
                return ((lead - 0xD800) * 0x400) + (trail - 0xDC00) + 0x10000;
            }
            unit += isPair ? 2 : 1;
        }
        return null;
    };
}

if (!String.fromCodePoint) {
    // Fallback for engines without String.fromCodePoint: concatenates the
    // characters for each code point passed in. Code points above 0xFFFF are
    // emitted as a surrogate pair; no range validation is performed.
    String.fromCodePoint = function () {
        var result = "";
        for (var k = 0; k < arguments.length; k += 1) {
            var point = arguments[k];
            var astral = point - 0x10000;
            if (point > 0xFFFF) {
                // High surrogate from the top 10 bits, low from the bottom 10.
                result += String.fromCharCode(0xD800 + (astral >> 10), 0xDC00 + (astral & 0x3FF));
            } else {
                result += String.fromCharCode(point);
            }
        }
        return result;
    };
}

if (!String.prototype.ucCharAt) {
    // Returns the character at code-point position ucIndex as a string: one
    // code unit for BMP characters and lone surrogates, two code units for a
    // well-formed surrogate pair. Mirrors String.prototype.charAt: a missing
    // or non-numeric index means 0, fractional indexes are truncated, and an
    // out-of-range index yields "".
    //
    // The string is walked here rather than delegating to codePointAt,
    // because the built-in codePointAt takes a UTF-16 code-unit index, not a
    // code-point index; passing a code-point index to it returns half of a
    // surrogate pair for any position after the first astral character.
    String.prototype.ucCharAt = function (ucIndex) {
        var str = String(this);
        var target = Number(ucIndex);
        if (isNaN(target)) {
            target = 0;
        }
        // Truncate toward zero, as charAt does.
        target = target < 0 ? Math.ceil(target) : Math.floor(target);
        if (target < 0) {
            return "";
        }
        var unitIndex = 0;
        var ucPos = 0;
        while (unitIndex < str.length) {
            var code = str.charCodeAt(unitIndex);
            var next = str.charCodeAt(unitIndex + 1); // NaN past the end
            var isPair = 0xD800 <= code && code <= 0xDBFF &&
                         0xDC00 <= next && next <= 0xDFFF;
            var width = isPair ? 2 : 1;
            if (ucPos === target) {
                return str.slice(unitIndex, unitIndex + width);
            }
            unitIndex += width;
            ucPos += 1;
        }
        return "";
    };
}

if (!String.prototype.ucIndexOf) {
    // Code-point-aware counterpart of String.prototype.indexOf.
    //
    // searchStr: value to look for (converted with String()).
    // ucStart:   code-point position to start from; missing, non-numeric or
    //            negative values mean 0, values past the end are clamped.
    // Returns the code-point position of the first match at or after ucStart,
    // or -1. An empty searchStr matches at the (clamped) start position, as
    // with the native indexOf.
    //
    // Both strings are split into code points first, so a match can never
    // begin or end in the middle of a surrogate pair, and the search does not
    // rely on any other helper method being installed.
    String.prototype.ucIndexOf = function (searchStr, ucStart) {
        // A surrogate pair is one element; any other code unit stands alone.
        var codePointPattern = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g;
        var haystack = String(this).match(codePointPattern) || [];
        var needle = String(searchStr).match(codePointPattern) || [];

        var start = Number(ucStart);
        if (isNaN(start) || start < 0) {
            start = 0;
        }
        start = Math.floor(start);
        if (start > haystack.length) {
            start = haystack.length;
        }

        // Last position at which the whole needle still fits.
        var lastStart = haystack.length - needle.length;
        for (var i = start; i <= lastStart; i++) {
            var j = 0;
            while (j < needle.length && haystack[i + j] === needle[j]) {
                j++;
            }
            if (j === needle.length) {
                return i;
            }
        }
        return -1;
    };
}

if (!String.prototype.ucLastIndexOf) {
    // Code-point-aware counterpart of String.prototype.lastIndexOf.
    //
    // searchStr: value to look for (converted with String()).
    // ucStart:   highest code-point position at which a match may begin;
    //            missing or non-numeric means "search from the end", values
    //            that are too large are clamped, negative values mean 0.
    // Returns the code-point position of the last match beginning at or
    // before ucStart, or -1.
    //
    // Both strings are split into code points first, so a match can never
    // begin or end in the middle of a surrogate pair, and the search does not
    // rely on any other helper method being installed.
    String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
        // A surrogate pair is one element; any other code unit stands alone.
        var codePointPattern = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g;
        var haystack = String(this).match(codePointPattern) || [];
        var needle = String(searchStr).match(codePointPattern) || [];

        // Last position at which the whole needle still fits.
        var lastStart = haystack.length - needle.length;
        if (lastStart < 0) {
            return -1;
        }

        var start = Number(ucStart);
        if (isNaN(start) || start > lastStart) {
            start = lastStart;
        }
        start = Math.floor(start);
        if (start < 0) {
            start = 0;
        }

        for (var i = start; i >= 0; i--) {
            var j = 0;
            while (j < needle.length && haystack[i + j] === needle[j]) {
                j++;
            }
            if (j === needle.length) {
                return i;
            }
        }
        return -1;
    };
}

if (!String.prototype.ucSlice) {
    // Code-point-aware counterpart of String.prototype.slice.
    //
    // ucStart: first code-point position to include (default 0).
    // ucStop:  code-point position to stop before (default: end of string).
    // Negative positions count back from the end and out-of-range positions
    // are clamped, exactly as Array.prototype.slice does for the array of
    // code points the string is split into.
    //
    // An omitted ucStop means "through the last character"; the result is
    // never padded when ucStop lies beyond the end of the string.
    String.prototype.ucSlice = function (ucStart, ucStop) {
        // A surrogate pair is one element; any other code unit stands alone.
        var codePoints = String(this).match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
        return codePoints.slice(ucStart, ucStop).join("");
    };
}

if (!String.prototype.ucSplit) {
    // Code-point-aware counterpart of String.prototype.split.
    //
    // delimeter: separator. Only the empty string '' is handled specially:
    //            the string is then split into whole code points, so that
    //            surrogate pairs are never torn apart. Any other separator
    //            is passed straight to the native split.
    // limit:     optional maximum number of pieces; when omitted, every
    //            piece is returned.
    // Returns an array of strings.
    String.prototype.ucSplit = function (delimeter, limit) {
        var str = String(this);
        if (delimeter === '') {
            // A surrogate pair is one element; any other code unit stands alone.
            var ucChars = str.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
            if (limit === undefined) {
                return ucChars.slice();
            }
            // Same unsigned 32-bit conversion the native split applies to limit.
            return ucChars.slice(0, limit >>> 0);
        }
        return str.split(delimeter, limit);
    };
}

Solution 3

More recent JavaScript engines have String.fromCodePoint.

// Build a one-character string from code point U+20001, which is above U+FFFF.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP

Strings also have a code-point iterator, which you can use to get the length in code points.

// Counts the code points in str by walking its default iterator, which
// advances one code point (not one UTF-16 code unit) per step.
function countCodePoints( str )
{
    let total = 0;
    for( const codePoint of str )
    {
        total += 1;
    }
    return total;
}

console.log( ideograph.length ); // logs 2: length counts UTF-16 code units
console.log( countCodePoints(ideograph) ); // logs 1: a single code point
Share:
12,353
Delan Azabani
Author by

Delan Azabani

I’m a computer programmer based in Perth, Australia. My passions include all kinds of systems programming from Rust to x86 assembly and beyond, internationalisation and Unicode, digital preservation and archaeology, typography, cryptography, and above all, teaching. I’m also a trans woman (she/her pronouns)!

Updated on June 22, 2022

Comments

  • Delan Azabani
    Delan Azabani almost 2 years

    BMP being Basic Multilingual Plane

    According to JavaScript: the Good Parts:

    JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.

    This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.

    Further investigation confirms this:

    > String.fromCharCode(0x20001);
    

    The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.

    Question: is it at all possible to handle post-BMP characters in JavaScript?


    2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:

  • Delan Azabani
    Delan Azabani over 13 years
    Thank you very much. That's a great, detailed answer.
  • Mathias Bynens
    Mathias Bynens over 12 years
    @bobince So, technically, does JS use UCS-2 or UTF-16? UCS-2 doesn’t support characters outside the BMP, but JavaScript does if the individual surrogate halves are entered individually (e.g. '\uD834\uDD1E' for U+1D11E). But does that make it UTF-16?
  • Amit Patil
    Amit Patil over 12 years
    @Mathias: JavaScript is UTF-16-ignorant. It gives you a sequence of 16-bit code units and lets you put what you like in it. You can store surrogates in it if you want, but you won't get any special features to handle them as characters. Whether you want to describe that as ‘using’ UCS-2 or UTF-16 is a semantic argument to which there is not one definitive answer. However, regardless of language-level support in JS, other parts of the browser do support surrogates for rendering/interaction in the UI, so it makes some sense to include them in JS strings.
  • Mathias Bynens
    Mathias Bynens over 12 years
    @bobince Thanks! I looked into it a bit further and have written up my findings here: mathiasbynens.be/notes/javascript-encoding Feedback welcome.
  • Amit Patil
    Amit Patil over 10 years
    (Updated fromCodePoint to match the name proposed for ECMAScript 6's support for proper Unicode. This is now effectively a polyfill.)
  • Ahmed Fasih
    Ahmed Fasih over 9 years
    Many thanks for releasing into public domain. You, sir/madam, are a gentleman/woman and a scholar.
  • Ahmed Fasih
    Ahmed Fasih over 9 years
    ucCharAt seems to be broken. "🌔🌖🐺🐶🍄".ucCharAt(0) returns the correct value but change the 0 to a 1 and it returns gibberish. Change it to 2 and it returns the second (instead of the first) symbol. So to get to the last symbol, you have to call ucCharAt(8) which is larger than the string's ucLength.
  • hippietrail
    hippietrail almost 7 years
    Changing my vote since this answer is now out of date in saying "There is no language-level support for handling full characters". There is now some language level full character support, such as codePointAt, fromCodePoint, Array.from(), /u, for ... of, the ... operator. Perhaps others?