delphi html decode

14,821

Solution 1

The HttpApp.HttpDecode function doesn't decode HTML entities (https://www.w3.org/TR/html4/sgml/entities.html#sym)

For example: &there4; (which should decode to ∴)

function HtmlDecode(s: UnicodeString): UnicodeString;
{ 
   Public domain: No attribution required
   Known issue, it doesn't handle entities with characters code points above $FFFF (65536)
   e.g.: &𒀇𐏁𐎥;

   That's because UTF-16 requires 2 characters to encode one character.
 }

    function UCS4CharToString(uch: UCS4Char): UnicodeString;
    var
        codePoints: UCS4String;
    begin
        //Wrap the single code point in a two-element UCS4 string:
        //element 0 carries the character, element 1 is the null terminator.
        SetLength(codePoints, 2);
        codePoints[1] := 0;
        codePoints[0] := uch;

        Result := UCS4StringToUnicodeString(codePoints);
    end;

    function GetCharRef(sValue: UnicodeString; StartIndex: Integer; out CharRef: string): UnicodeString;
    {
        Decodes the numeric character reference that starts at sValue[StartIndex].

        Character references come in either decimal or hex form:

            CharRef  ::=  '&#' [0-9]+ ';'
                       |  '&#x' [0-9a-fA-F]+ ';'

        e.g. &#9830; (decimal) and &#x2666; (hexadecimal) both name U+2666.
        The hex marker is accepted as 'x' or 'X'.

        Parameters:
            sValue      the text being decoded
            StartIndex  1-based index of the '&' that may start a reference
            CharRef     receives the complete matched reference, '&' through ';',
                        so the caller knows how many characters were consumed

        Returns the decoded text. When the text at StartIndex is not a well-formed
        reference (no digits, no terminating ';', a hex digit in a decimal
        reference) or the number is larger than U+10FFFF, both Result and CharRef
        are empty. The digits are parsed by hand, so malformed or oversized input
        is reported that way rather than through a conversion exception.
    }
    const
        MaxCodePoint = $10FFFF; //highest code point Unicode defines
    var
        i: Integer;
        lastIndex: Integer;
        firstDigit: Integer;
        radix: Integer;
        digit: Integer;
        codePoint: Integer;
        ch: WideChar;
    begin
        Result := '';
        CharRef := '';

        lastIndex := Length(sValue);
        //The shortest possible reference, e.g. "&#9;", is four characters long
        if (StartIndex < 1) or (lastIndex - StartIndex + 1 < 4) then
            Exit;

        i := StartIndex;
        if sValue[i] <> '&' then Exit;
        Inc(i);
        if sValue[i] <> '#' then Exit;
        Inc(i);

        radix := 10;
        if (sValue[i] = 'x') or (sValue[i] = 'X') then
        begin
            radix := 16;
            Inc(i); //skip the x; the length check above keeps i inside the string
        end;

        firstDigit := i;
        codePoint := 0;
        while i <= lastIndex do
        begin
            ch := sValue[i];
            case ch of
            '0'..'9': digit := Ord(ch) - Ord('0');
            'a'..'f': digit := Ord(ch) - Ord('a') + 10;
            'A'..'F': digit := Ord(ch) - Ord('A') + 10;
            else
                Break; //first character that cannot be a digit
            end;

            if digit >= radix then
                Break; //a hex letter inside a decimal reference

            //Stop accumulating once the value has left the Unicode range, so an
            //arbitrarily long run of digits cannot overflow the integer.
            if codePoint <= MaxCodePoint then
                codePoint := codePoint * radix + digit;

            Inc(i);
        end;

        if i = firstDigit then
            Exit; //no digits at all, e.g. "&#;" or "&#x;"
        if i > lastIndex then
            Exit; //ran off the end of the string without finding ';'
        if sValue[i] <> ';' then
            Exit;
        if codePoint > MaxCodePoint then
            Exit; //not a Unicode code point; leave the text undecoded

        CharRef := Copy(sValue, StartIndex, (i-StartIndex)+1);
        Result := UCS4CharToString(UCS4Char(codePoint));
    end;

    function GetEntityRef(sValue: string; StartIndex: Integer; out CharRef: string): UnicodeString;

        function IsNameStartChar(ch: WideChar): Boolean;
        begin
            {
                Returns True when ch may begin an entity name:

                NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]

                The final range [#x10000-#xEFFFF] cannot be tested here: ch is a single
                16-bit UTF-16 code unit, so its ordinal never exceeds $FFFF. Supporting
                it would require decoding surrogate pairs (e.g. by working in UCS4).

                The comparison is done on ordinals rather than on character literals.
                A two-digit literal such as #$C0 is an AnsiChar constant unless
                HIGHCHARUNICODE is ON, and would be translated through the compiler's
                code page before being compared with a WideChar.
            }
            case Ord(ch) of
            Ord(':'), Ord('A')..Ord('Z'), Ord('_'), Ord('a')..Ord('z'),
            $C0..$D6, $D8..$F6, $F8..$2FF,
            $370..$37D, $37F..$1FFF,
            $200C..$200D, $2070..$218F, $2C00..$2FEF,
            $3001..$D7FF, $F900..$FDCF, $FDF0..$FFFD:
                Result := True;
            else
                Result := False;
            end;
        end;

        function IsNameChar(ch: WideChar): Boolean;
        begin
            {
                Returns True when ch may appear after the first character of an entity name:

                NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]

                Ordinals are compared instead of character literals because a two-digit
                literal such as #$B7 is an AnsiChar constant unless HIGHCHARUNICODE is ON,
                and would be translated through the compiler's code page.
            }
            case Ord(ch) of
            Ord('-'), Ord('.'), Ord('0')..Ord('9'), $B7, $0300..$036F, $203F..$2040:
                Result := True;
            else
                //Everything that can start a name can also continue one
                Result := IsNameStartChar(ch);
            end;
        end;

        type
            THtmlEntity = record
                entity: string;
                ch: UCS4Char;
            end;
        const
            //https://www.w3.org/TR/html4/sgml/entities.html#sym
            //html entities are case sensitive (e.g. "larr" is different from "lArr")
            HtmlEntities: array[0..252] of THtmlEntity = (
                (entity: 'apos';        ch: 39; ), // apostrophe (originally only existed in xml, and not in HTML. Was added to HTML5
                (entity: 'quot';        ch: 34; ),  // quotation mark = APL quote, U+0022
                (entity: 'amp';     ch: 38; ),  // ampersand, U+0026
                (entity: 'lt';          ch: 60; ),  // less-than sign, U+003C
                (entity: 'gt';          ch: 62; ),  // greater-than sign, U+003E
                (entity: 'OElig';       ch: 338;    ),  // latin capital ligature OE, U+0152
                (entity: 'oelig';       ch: 339;    ),  // latin small ligature oe, U+0153
                (entity: 'Scaron';  ch: 352;    ),  // latin capital letter S with caron, U+0160
                (entity: 'scaron';  ch: 353;    ),  // latin small letter s with caron, U+0161
                (entity: 'Yuml';        ch: 376;    ),  // latin capital letter Y with diaeresis, U+0178
                (entity: 'circ';        ch: 710;    ),  // modifier letter circumflex accent, U+02C6
                (entity: 'tilde';       ch: 732;    ),  // small tilde, U+02DC
                (entity: 'nbsp';        ch: 160;    ),  // no-break space = non-breaking space,    U+00A0
                (entity: 'iexcl';       ch: 161;    ),  // inverted exclamation mark, U+00A1
                (entity: 'cent';        ch: 162;    ),  // cent sign, U+00A2
                (entity: 'pound';       ch: 163;    ),  // pound sign, U+00A3
                (entity: 'curren';  ch: 164;    ),  // currency sign, U+00A4
                (entity: 'yen';     ch: 165;    ),  // yen sign = yuan sign, U+00A5
                (entity: 'brvbar';  ch: 166;    ),  // broken bar = broken vertical bar,    U+00A6
                (entity: 'sect';        ch: 167;    ),  // section sign, U+00A7
                (entity: 'uml';     ch: 168;    ),  // diaeresis = spacing diaeresis,    U+00A8
                (entity: 'copy';        ch: 169;    ),  // copyright sign, U+00A9
                (entity: 'ordf';        ch: 170;    ),  // feminine ordinal indicator, U+00AA
                (entity: 'laquo';       ch: 171;    ),  // left-pointing double angle quotation mark = left pointing guillemet, U+00AB
                (entity: 'not';     ch: 172;    ),  // not sign, U+00AC
                (entity: 'shy';     ch: 173;    ),  // soft hyphen = discretionary hyphen,    U+00AD
                (entity: 'reg';     ch: 174;    ),  // registered sign = registered trade mark sign,    U+00AE
                (entity: 'macr';        ch: 175;    ),  // macron = spacing macron = overline  = APL overbar, U+00AF
                (entity: 'deg';     ch: 176;    ),  // degree sign, U+00B0
                (entity: 'plusmn';  ch: 177;    ),  // plus-minus sign = plus-or-minus sign,    U+00B1
                (entity: 'sup2';        ch: 178;    ),  // superscript two = superscript digit two  = squared, U+00B2
                (entity: 'sup3';        ch: 179;    ),  // superscript three = superscript digit three  = cubed, U+00B3
                (entity: 'acute';       ch: 180;    ),  // acute accent = spacing acute,    U+00B4
                (entity: 'micro';       ch: 181;    ),  // micro sign, U+00B5
                (entity: 'para';        ch: 182;    ),  // pilcrow sign = paragraph sign,    U+00B6
                (entity: 'middot';  ch: 183;    ),  // middle dot = Georgian comma = Greek middle dot, U+00B7
                (entity: 'cedil';       ch: 184;    ),  // cedilla = spacing cedilla, U+00B8
                (entity: 'sup1';        ch: 185;    ),  // superscript one = superscript digit one,    U+00B9
                (entity: 'ordm';        ch: 186;    ),  // masculine ordinal indicator,    U+00BA
                (entity: 'raquo';       ch: 187;    ),  // right-pointing double angle quotation mark =  right pointing guillemet, U+00BB
                (entity: 'frac14';  ch: 188;    ),  // vulgar fraction one quarter  = fraction one quarter, U+00BC
                (entity: 'frac12';  ch: 189;    ),  // vulgar fraction one half  = fraction one half, U+00BD
                (entity: 'frac34';  ch: 190;    ),  // vulgar fraction three quarters  = fraction three quarters, U+00BE
                (entity: 'iquest';  ch: 191;    ),  // inverted question mark  = turned question mark, U+00BF
                (entity: 'Agrave';  ch: 192;    ),  // latin capital letter A with grave  = latin capital letter A grave,    U+00C0
                (entity: 'Aacute';  ch: 193;    ),  // latin capital letter A with acute,    U+00C1
                (entity: 'Acirc';       ch: 194;    ),  // latin capital letter A with circumflex,    U+00C2
                (entity: 'Atilde';  ch: 195;    ),  // latin capital letter A with tilde,    U+00C3
                (entity: 'Auml';        ch: 196;    ),  // latin capital letter A with diaeresis,    U+00C4
                (entity: 'Aring';       ch: 197;    ),  // latin capital letter A with ring above  = latin capital letter A ring,    U+00C5
                (entity: 'AElig';       ch: 198;    ),  // latin capital letter AE  = latin capital ligature AE,    U+00C6
                (entity: 'Ccedil';  ch: 199;    ),  // latin capital letter C with cedilla,    U+00C7
                (entity: 'Egrave';  ch: 200;    ),  // latin capital letter E with grave,    U+00C8
                (entity: 'Eacute';  ch: 201;    ),  // latin capital letter E with acute,    U+00C9
                (entity: 'Ecirc';       ch: 202;    ),  // latin capital letter E with circumflex,    U+00CA
                (entity: 'Euml';        ch: 203;    ),  // latin capital letter E with diaeresis,    U+00CB
                (entity: 'Igrave';  ch: 204;    ),  // latin capital letter I with grave,    U+00CC
                (entity: 'Iacute';  ch: 205;    ),  // latin capital letter I with acute,    U+00CD
                (entity: 'Icirc';       ch: 206;    ),  // latin capital letter I with circumflex,    U+00CE
                (entity: 'Iuml';        ch: 207;    ),  // latin capital letter I with diaeresis,    U+00CF
                (entity: 'ETH';     ch: 208;    ),  // latin capital letter ETH, U+00D0
                (entity: 'Ntilde';  ch: 209;    ),  // latin capital letter N with tilde,    U+00D1
                (entity: 'Ograve';  ch: 210;    ),  // latin capital letter O with grave,    U+00D2
                (entity: 'Oacute';  ch: 211;    ),  // latin capital letter O with acute,    U+00D3
                (entity: 'Ocirc';       ch: 212;    ),  // latin capital letter O with circumflex,    U+00D4
                (entity: 'Otilde';  ch: 213;    ),  // latin capital letter O with tilde,    U+00D5
                (entity: 'Ouml';        ch: 214;    ),  // latin capital letter O with diaeresis,    U+00D6
                (entity: 'times';       ch: 215;    ),  // multiplication sign, U+00D7
                (entity: 'Oslash';  ch: 216;    ),  // latin capital letter O with stroke  = latin capital letter O slash,    U+00D8
                (entity: 'Ugrave';  ch: 217;    ),  // latin capital letter U with grave,    U+00D9
                (entity: 'Uacute';  ch: 218;    ),  // latin capital letter U with acute,    U+00DA
                (entity: 'Ucirc';       ch: 219;    ),  // latin capital letter U with circumflex,    U+00DB
                (entity: 'Uuml';        ch: 220;    ),  // latin capital letter U with diaeresis,    U+00DC
                (entity: 'Yacute';  ch: 221;    ),  // latin capital letter Y with acute,    U+00DD
                (entity: 'THORN';       ch: 222;    ),  // latin capital letter THORN,    U+00DE
                (entity: 'szlig';       ch: 223;    ),  // latin small letter sharp s = ess-zed,    U+00DF
                (entity: 'agrave';  ch: 224;    ),  // latin small letter a with grave  = latin small letter a grave,    U+00E0
                (entity: 'aacute';  ch: 225;    ),  // latin small letter a with acute,    U+00E1
                (entity: 'acirc';       ch: 226;    ),  // latin small letter a with circumflex,    U+00E2
                (entity: 'atilde';  ch: 227;    ),  // latin small letter a with tilde,    U+00E3
                (entity: 'auml';        ch: 228;    ),  // latin small letter a with diaeresis,    U+00E4
                (entity: 'aring';       ch: 229;    ),  // latin small letter a with ring above  = latin small letter a ring,    U+00E5
                (entity: 'aelig';       ch: 230;    ),  // latin small letter ae  = latin small ligature ae, U+00E6
                (entity: 'ccedil';  ch: 231;    ),  // latin small letter c with cedilla,    U+00E7
                (entity: 'egrave';  ch: 232;    ),  // latin small letter e with grave,    U+00E8
                (entity: 'eacute';  ch: 233;    ),  // latin small letter e with acute,    U+00E9
                (entity: 'ecirc';       ch: 234;    ),  // latin small letter e with circumflex,    U+00EA
                (entity: 'euml';        ch: 235;    ),  // latin small letter e with diaeresis,    U+00EB
                (entity: 'igrave';  ch: 236;    ),  // latin small letter i with grave,    U+00EC
                (entity: 'iacute';  ch: 237;    ),  // latin small letter i with acute,    U+00ED
                (entity: 'icirc';       ch: 238;    ),  // latin small letter i with circumflex,    U+00EE
                (entity: 'iuml';        ch: 239;    ),  // latin small letter i with diaeresis,    U+00EF
                (entity: 'eth';     ch: 240;    ),  // latin small letter eth, U+00F0
                (entity: 'ntilde';  ch: 241;    ),  // latin small letter n with tilde,    U+00F1
                (entity: 'ograve';  ch: 242;    ),  // latin small letter o with grave,    U+00F2
                (entity: 'oacute';  ch: 243;    ),  // latin small letter o with acute,    U+00F3
                (entity: 'ocirc';       ch: 244;    ),  // latin small letter o with circumflex,    U+00F4
                (entity: 'otilde';  ch: 245;    ),  // latin small letter o with tilde,    U+00F5
                (entity: 'ouml';        ch: 246;    ),  // latin small letter o with diaeresis,    U+00F6
                (entity: 'divide';  ch: 247;    ),  // division sign, U+00F7
                (entity: 'oslash';  ch: 248;    ),  // latin small letter o with stroke,    = latin small letter o slash,    U+00F8
                (entity: 'ugrave';  ch: 249;    ),  // latin small letter u with grave,    U+00F9
                (entity: 'uacute';  ch: 250;    ),  // latin small letter u with acute,    U+00FA
                (entity: 'ucirc';       ch: 251;    ),  // latin small letter u with circumflex,    U+00FB
                (entity: 'uuml';        ch: 252;    ),  // latin small letter u with diaeresis,    U+00FC
                (entity: 'yacute';  ch: 253;    ),  // latin small letter y with acute,    U+00FD
                (entity: 'thorn';       ch: 254;    ),  // latin small letter thorn,    U+00FE
                (entity: 'yuml';        ch: 255;    ),  // latin small letter y with diaeresis,    U+00FF
                (entity: 'fnof';        ch: 402;    ),  // latin small f with hook = function  = florin, U+0192
                (entity: 'Alpha';       ch: 913;    ),  // greek capital letter alpha, U+0391
                (entity: 'Beta';        ch: 914;    ),  // greek capital letter beta, U+0392
                (entity: 'Gamma';       ch: 915;    ),  // greek capital letter gamma,    U+0393
                (entity: 'Delta';       ch: 916;    ),  // greek capital letter delta,    U+0394
                (entity: 'Epsilon'; ch: 917;    ),  // greek capital letter epsilon, U+0395
                (entity: 'Zeta';        ch: 918;    ),  // greek capital letter zeta, U+0396
                (entity: 'Eta';     ch: 919;    ),  // greek capital letter eta, U+0397
                (entity: 'Theta';       ch: 920;    ),  // greek capital letter theta,    U+0398
                (entity: 'Iota';        ch: 921;    ),  // greek capital letter iota, U+0399
                (entity: 'Kappa';       ch: 922;    ),  // greek capital letter kappa, U+039A
                (entity: 'Lambda';  ch: 923;    ),  // greek capital letter lambda,    U+039B
                (entity: 'Mu';          ch: 924;    ),  // greek capital letter mu, U+039C
                (entity: 'Nu';          ch: 925;    ),  // greek capital letter nu, U+039D
                (entity: 'Xi';          ch: 926;    ),  // greek capital letter xi, U+039E
                (entity: 'Omicron'; ch: 927;    ),  // greek capital letter omicron, U+039F
                (entity: 'Pi';          ch: 928;    ),  // greek capital letter pi, U+03A0
                (entity: 'Rho';     ch: 929;    ),  // greek capital letter rho, U+03A1
                // there is no Sigmaf, and no U+03A2 character either
                (entity: 'Sigma';       ch: 931;    ),  // greek capital letter sigma,    U+03A3
                (entity: 'Tau';     ch: 932;    ),  // greek capital letter tau, U+03A4
                (entity: 'Upsilon'; ch: 933;    ),  // greek capital letter upsilon,    U+03A5
                (entity: 'Phi';     ch: 934;    ),  // greek capital letter phi,    U+03A6
                (entity: 'Chi';     ch: 935;    ),  // greek capital letter chi, U+03A7
                (entity: 'Psi';     ch: 936;    ),  // greek capital letter psi,    U+03A8
                (entity: 'Omega';       ch: 937;    ),  // greek capital letter omega,    U+03A9
                (entity: 'alpha';       ch: 945;    ),  // greek small letter alpha,    U+03B1
                (entity: 'beta';        ch: 946;    ),  // greek small letter beta, U+03B2
                (entity: 'gamma';       ch: 947;    ),  // greek small letter gamma,    U+03B3
                (entity: 'delta';       ch: 948;    ),  // greek small letter delta,    U+03B4
                (entity: 'epsilon'; ch: 949;    ),  // greek small letter epsilon,    U+03B5
                (entity: 'zeta';        ch: 950;    ),  // greek small letter zeta, U+03B6
                (entity: 'eta';     ch: 951;    ),  // greek small letter eta, U+03B7
                (entity: 'theta';       ch: 952;    ),  // greek small letter theta,    U+03B8
                (entity: 'iota';        ch: 953;    ),  // greek small letter iota, U+03B9
                (entity: 'kappa';       ch: 954;    ),  // greek small letter kappa,    U+03BA
                (entity: 'lambda';  ch: 955;    ),  // greek small letter lambda,    U+03BB
                (entity: 'mu';          ch: 956;    ),  // greek small letter mu, U+03BC
                (entity: 'nu';          ch: 957;    ),  // greek small letter nu, U+03BD
                (entity: 'xi';          ch: 958;    ),  // greek small letter xi, U+03BE
                (entity: 'omicron'; ch: 959;    ),  // greek small letter omicron, U+03BF NEW
                (entity: 'pi';          ch: 960;    ),  // greek small letter pi, U+03C0
                (entity: 'rho';     ch: 961;    ),  // greek small letter rho, U+03C1
                (entity: 'sigmaf';  ch: 962;    ),  // greek small letter final sigma,    U+03C2
                (entity: 'sigma';       ch: 963;    ),  // greek small letter sigma,    U+03C3
                (entity: 'tau';     ch: 964;    ),  // greek small letter tau, U+03C4
                (entity: 'upsilon'; ch: 965;    ),  // greek small letter upsilon,    U+03C5
                (entity: 'phi';     ch: 966;    ),  // greek small letter phi, U+03C6
                (entity: 'chi';     ch: 967;    ),  // greek small letter chi, U+03C7
                (entity: 'psi';     ch: 968;    ),  // greek small letter psi, U+03C8
                (entity: 'omega';       ch: 969;    ),  // greek small letter omega,    U+03C9
                (entity: 'thetasym';    ch: 977;    ),  // greek small letter theta symbol,    U+03D1 NEW
                (entity: 'upsih';       ch: 978;    ),  // greek upsilon with hook symbol,    U+03D2 NEW
                (entity: 'piv';     ch: 982;    ),  // greek pi symbol, U+03D6
                (entity: 'bull';        ch: 8226;   ),  // bullet = black small circle,  U+2022
                (entity: 'hellip';  ch: 8230;   ),  // horizontal ellipsis = three dot leader,  U+2026
                (entity: 'prime';       ch: 8242;   ),  // prime = minutes = feet, U+2032
                (entity: 'Prime';       ch: 8243;   ),  // double prime = seconds = inches,  U+2033
                (entity: 'oline';       ch: 8254;   ),  // overline = spacing overscore,  U+203E NEW
                (entity: 'frasl';       ch: 8260;   ),  // fraction slash, U+2044 NEW
                (entity: 'ensp';        ch: 8194;   ),  // en space, U+2002
                (entity: 'emsp';        ch: 8195;   ),  // em space, U+2003
                (entity: 'thinsp';  ch: 8201;   ),  // thin space, U+2009
                (entity: 'zwnj';        ch: 8204;   ),  // zero width non-joiner, U+200C NEW RFC 2070
                (entity: 'zwj';     ch: 8205;   ),  // zero width joiner, U+200D NEW RFC 2070
                (entity: 'lrm';     ch: 8206;   ),  // left-to-right mark, U+200E NEW RFC 2070
                (entity: 'rlm';     ch: 8207;   ),  // right-to-left mark, U+200F NEW RFC 2070
                (entity: 'ndash';       ch: 8211;   ),  // en dash, U+2013
                (entity: 'mdash';       ch: 8212;   ),  // em dash, U+2014
                (entity: 'lsquo';       ch: 8216;   ),  // left single quotation mark, U+2018
                (entity: 'rsquo';       ch: 8217;   ),  // right single quotation mark, U+2019
                (entity: 'sbquo';       ch: 8218;   ),  // single low-9 quotation mark, U+201A NEW
                (entity: 'ldquo';       ch: 8220;   ),  // left double quotation mark, U+201C
                (entity: 'rdquo';       ch: 8221;   ),  // right double quotation mark, U+201D
                (entity: 'bdquo';       ch: 8222;   ),  // double low-9 quotation mark, U+201E NEW
                (entity: 'dagger';  ch: 8224;   ),  // dagger, U+2020
                (entity: 'Dagger';  ch: 8225;   ),  // double dagger, U+2021
                (entity: 'permil';  ch: 8240;   ),  // per mille sign, U+2030
                (entity: 'lsaquo';  ch: 8249;   ),  // single left-pointing angle quotation mark, U+2039
                (entity: 'rsaquo';  ch: 8250;   ),  // single right-pointing angle quotation mark, U+203A
                (entity: 'euro';        ch: 8364;   ),  // euro sign, U+20AC NEW
                (entity: 'weierp';  ch: 8472;   ),  // script capital P = power set   = Weierstrass p, U+2118
                (entity: 'image';       ch: 8465;   ),  // blackletter capital I = imaginary part,  U+2111
                (entity: 'real';        ch: 8476;   ),  // blackletter capital R = real part symbol,  U+211C
                (entity: 'trade';       ch: 8482;   ),  // trade mark sign, U+2122
                (entity: 'alefsym'; ch: 8501;   ),  // alef symbol = first transfinite cardinal,  U+2135 NEW  (alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters)
                (entity: 'larr';        ch: 8592;   ),  // leftwards arrow, U+2190
                (entity: 'uarr';        ch: 8593;   ),  // upwards arrow, U+2191
                (entity: 'rarr';        ch: 8594;   ),  // rightwards arrow, U+2192
                (entity: 'darr';        ch: 8595;   ),  // downwards arrow, U+2193
                (entity: 'harr';        ch: 8596;   ),  // left right arrow, U+2194
                (entity: 'crarr';       ch: 8629;   ),  // downwards arrow with corner leftwards   = carriage return, U+21B5 NEW
                (entity: 'lArr';        ch: 8656;   ),  // leftwards double arrow, U+21D0
                (entity: 'uArr';        ch: 8657;   ),  // upwards double arrow, U+21D1
                (entity: 'rArr';        ch: 8658;   ),  // rightwards double arrow,  U+21D2
                (entity: 'dArr';        ch: 8659;   ),  // downwards double arrow, U+21D3
                (entity: 'hArr';        ch: 8660;   ),  // left right double arrow,  U+21D4
                (entity: 'forall';  ch: 8704;   ),  // for all, U+2200
                (entity: 'part';        ch: 8706;   ),  // partial differential, U+2202
                (entity: 'exist';       ch: 8707;   ),  // there exists, U+2203
                (entity: 'empty';       ch: 8709;   ),  // empty set = null set = diameter,  U+2205
                (entity: 'nabla';       ch: 8711;   ),  // nabla = backward difference,  U+2207
                (entity: 'isin';        ch: 8712;   ),  // element of, U+2208
                (entity: 'notin';       ch: 8713;   ),  // not an element of, U+2209
                (entity: 'ni';          ch: 8715;   ),  // contains as member, U+220B
                (entity: 'prod';        ch: 8719;   ),  // n-ary product = product sign,  U+220F
                (entity: 'sum';     ch: 8721;   ),  // n-ary sumation, U+2211
                (entity: 'minus';       ch: 8722;   ),  // minus sign, U+2212
                (entity: 'lowast';  ch: 8727;   ),  // asterisk operator, U+2217
                (entity: 'radic';       ch: 8730;   ),  // square root = radical sign,  U+221A
                (entity: 'prop';        ch: 8733;   ),  // proportional to, U+221D
                (entity: 'infin';       ch: 8734;   ),  // infinity, U+221E
                (entity: 'ang';     ch: 8736;   ),  // angle, U+2220
                (entity: 'and';     ch: 8743;   ),  // logical and = wedge, U+2227
                (entity: 'or';          ch: 8744;   ),  // logical or = vee, U+2228
                (entity: 'cap';     ch: 8745;   ),  // intersection = cap, U+2229
                (entity: 'cup';     ch: 8746;   ),  // union = cup, U+222A
                (entity: 'int';     ch: 8747;   ),  // integral, U+222B
                (entity: 'there4';  ch: 8756;   ),  // therefore, U+2234
                (entity: 'sim';     ch: 8764;   ),  // tilde operator = varies with = similar to,  U+223C
                (entity: 'cong';        ch: 8773;   ),  // approximately equal to, U+2245
                (entity: 'asymp';       ch: 8776;   ),  // almost equal to = asymptotic to,  U+2248
                (entity: 'ne';          ch: 8800;   ),  // not equal to, U+2260
                (entity: 'equiv';       ch: 8801;   ),  // identical to, U+2261
                (entity: 'le';          ch: 8804;   ),  // less-than or equal to, U+2264
                (entity: 'ge';          ch: 8805;   ),  // greater-than or equal to,  U+2265
                (entity: 'sub';     ch: 8834;   ),  // subset of, U+2282
                (entity: 'sup';     ch: 8835;   ),  // superset of, U+2283
                (entity: 'nsub';        ch: 8836;   ),  // not a subset of, U+2284
                (entity: 'sube';        ch: 8838;   ),  // subset of or equal to, U+2286
                (entity: 'supe';        ch: 8839;   ),  // superset of or equal to,  U+2287
                (entity: 'oplus';       ch: 8853;   ),  // circled plus = direct sum,  U+2295
                (entity: 'otimes';  ch: 8855;   ),  // circled times = vector product,  U+2297
                (entity: 'perp';        ch: 8869;   ),  // up tack = orthogonal to = perpendicular,  U+22A5
                (entity: 'sdot';        ch: 8901;   ),  // dot operator, U+22C5
                (entity: 'lceil';       ch: 8968;   ),  // left ceiling = apl upstile,  U+2308
                (entity: 'rceil';       ch: 8969;   ),  // right ceiling, U+2309
                (entity: 'lfloor';  ch: 8970;   ),  // left floor = apl downstile,  U+230A
                (entity: 'rfloor';  ch: 8971;   ),  // right floor, U+230B
                (entity: 'lang';        ch: 9001;   ),  // left-pointing angle bracket = bra,  U+2329
                (entity: 'rang';        ch: 9002;   ),  // right-pointing angle bracket = ket,  U+232A
                (entity: 'loz';     ch: 9674;   ),  // lozenge, U+25CA
                (entity: 'spades';  ch: 9824;   ),  // black spade suit, U+2660
                (entity: 'clubs';       ch: 9827;   ),  // black club suit = shamrock,  U+2663
                (entity: 'hearts';  ch: 9829;   ),  // black heart suit = valentine,  U+2665
                (entity: 'diams';       ch: 9830;   )   // black diamond suit, U+2666
            );


    var
        scanPos: Integer;
        lastIndex: Integer;
        tableIndex: Integer;
        entityName: string;
    begin
        {
            Recognises a named entity reference starting at sValue[StartIndex]:

                EntityRef  ::=  '&' Name ';'
                Name       ::=  NameStartChar (NameChar)*

            On a syntactic match CharRef receives the whole reference ('&' through ';').
            Result is the replacement text when the name is in HtmlEntities, and stays
            empty when the name is unknown or the text is not a reference at all.
        }
        Result := '';
        CharRef := '';

        //Fewer than four characters left: too short to be examined
        lastIndex := Length(sValue);
        if (lastIndex - StartIndex + 1) < 4 then
            Exit;

        if sValue[StartIndex] <> '&' then
            Exit;
        if not IsNameStartChar(sValue[StartIndex+1]) then
            Exit;

        //Walk over the rest of the name; give up if the string ends first
        scanPos := StartIndex + 2;
        while IsNameChar(sValue[scanPos]) do
        begin
            if scanPos = lastIndex then
                Exit;
            Inc(scanPos);
        end;

        //The name must be closed by a semicolon
        if sValue[scanPos] <> ';' then
            Exit;

        CharRef := Copy(sValue, StartIndex, scanPos - StartIndex + 1);

        //The name is the reference without its leading & and trailing ;
        entityName := Copy(sValue, StartIndex + 1, scanPos - StartIndex - 1);

        //Entity names are case sensitive, so compare exactly
        for tableIndex := Low(HtmlEntities) to High(HtmlEntities) do
        begin
            if HtmlEntities[tableIndex].entity = entityName then
            begin
                Result := UCS4CharToString(HtmlEntities[tableIndex].ch);
                Exit;
            end;
        end;

        //Well-formed reference, but not a name we know about.
        //Report it to an attached debugger; the caller will keep the text as-is.
        if IsDebuggerPresent then
            OutputDebugString(PChar('HtmlDecode: Unknown HTML entity reference: "'+CharRef+'"'));
    end;

var
    cursor: Integer;
    refText: UnicodeString;
    decoded: UnicodeString;
begin
    Result := '';
    cursor := 1;

    while cursor <= Length(s) do
    begin
        decoded := '';
        refText := '';

        //Only an ampersand can start a reference: try the numeric form first,
        //then fall back to the named form.
        if s[cursor] = '&' then
        begin
            decoded := GetCharRef(s, cursor, {out}refText);
            if decoded = '' then
                decoded := GetEntityRef(s, cursor, {out}refText);
        end;

        if decoded <> '' then
        begin
            //Emit the replacement and skip over the whole reference
            Result := Result + decoded;
            Inc(cursor, Length(refText));
        end
        else
        begin
            //Ordinary character, or an ampersand that did not decode: copy it through
            Result := Result + s[cursor];
            Inc(cursor);
        end;
    end;
end;

Solution 2

Here's my HTMLDecode procedure (slightly modified from CG's HTTPApp unit):

function HTMLDecode(const AStr: String): String;
{
  Replaces the HTML references this routine knows about with the characters
  they stand for and returns the decoded text.

  Recognised references (names are case sensitive):
    &amp;  &lt;  &gt;  &quot;  &apos;  &nbsp;
    &#N;   decimal character reference
    &#xH;  hexadecimal character reference ('x' or 'X')

  Anything else - an unknown name, a reference without its closing ';', a
  numeric reference that is zero or does not fit in a single Char, or a bare
  '&' - is copied to the result unchanged. The result is never longer than
  AStr.
}

  { Examines the '&' at AStr[AmpPos]. When a known reference starts there,
    Decoded receives its replacement and RefLen the number of source
    characters it spans; otherwise Decoded is '&' and RefLen is 1. }
  procedure DecodeReferenceAt(AmpPos: Integer; out Decoded: Char; out RefLen: Integer);
  const
    MaxCharCode = Ord(High(Char));
    EntityNames: array[0..5] of String = ('amp', 'lt', 'gt', 'nbsp', 'quot', 'apos'); { do not localize }
    EntityChars: array[0..5] of Char = ('&', '<', '>', ' ', '"', '''');
  var
    K, SemiPos, FirstDigit, Radix, Digit, Value: Integer;
    RefName, Marker: String;
  begin
    Decoded := '&';
    RefLen := 1;

    { Find the terminating ';'. Stop at the first character that cannot be
      part of a reference, so a stray '&' never triggers a long scan. }
    SemiPos := 0;
    K := AmpPos + 1;
    while K <= Length(AStr) do
    begin
      case AStr[K] of
        ';':
          begin
            SemiPos := K;
            Break;
          end;
        '#', '0'..'9', 'A'..'Z', 'a'..'z':
          Inc(K);
      else
        Exit;
      end;
    end;
    if SemiPos = 0 then
      Exit; { the string ended before a ';' was found }

    RefName := Copy(AStr, AmpPos + 1, SemiPos - AmpPos - 1);
    if RefName = '' then
      Exit; { "&;" }

    if RefName[1] = '#' then
    begin
      { Numeric reference: decimal by default, hexadecimal after an x marker }
      Radix := 10;
      FirstDigit := 2;
      Marker := Copy(RefName, 2, 1);
      if (Marker = 'x') or (Marker = 'X') then
      begin
        Radix := 16;
        FirstDigit := 3;
      end;
      if FirstDigit > Length(RefName) then
        Exit; { no digits: "&#;" or "&#x;" }

      Value := 0;
      for K := FirstDigit to Length(RefName) do
      begin
        case RefName[K] of
          '0'..'9': Digit := Ord(RefName[K]) - Ord('0');
          'a'..'f': Digit := Ord(RefName[K]) - Ord('a') + 10;
          'A'..'F': Digit := Ord(RefName[K]) - Ord('A') + 10;
        else
          Exit; { a second '#' or a letter that is not a hex digit }
        end;
        if Digit >= Radix then
          Exit; { hex letter in a decimal reference }
        { Stop growing once out of range so long digit runs cannot overflow }
        if Value <= MaxCharCode then
          Value := Value * Radix + Digit;
      end;

      { Zero would embed a terminator; larger values do not fit in one Char }
      if (Value < 1) or (Value > MaxCharCode) then
        Exit;

      Decoded := Char(Value);
      RefLen := SemiPos - AmpPos + 1;
      Exit;
    end;

    for K := Low(EntityNames) to High(EntityNames) do
      if RefName = EntityNames[K] then
      begin
        Decoded := EntityChars[K];
        RefLen := SemiPos - AmpPos + 1;
        Exit;
      end;
  end;

var
  Src, Dst, RefLen: Integer;
  Ch: Char;
begin
  { Every reference is at least as long as its replacement, so a buffer the
    size of the input is always enough; it is trimmed at the end. }
  SetLength(Result, Length(AStr));
  Dst := 0;
  Src := 1;
  while Src <= Length(AStr) do
  begin
    Ch := AStr[Src];
    RefLen := 1;
    if Ch = '&' then
      DecodeReferenceAt(Src, Ch, RefLen);
    Inc(Dst);
    Result[Dst] := Ch;
    Inc(Src, RefLen);
  end;
  SetLength(Result, Dst);
end;
Share:
14,821
tekBlues
Author by

tekBlues

Developing excellent, good, mediocre and plain awful software since 1982. Civil Engineer Background: Assembler / C / Basic. Lots of experience with SQL and databases of all flavours. Twitter: @tekBlues email: [email protected]

Updated on June 09, 2022

Comments

  • tekBlues
    tekBlues almost 2 years

    I'm using Delphi 2009 and want to decode an HTML encoded string, for example:

    &#39; -> '
    

    But cannot find any built in function for doing this.

    Thanks in advance

  • MarkAurelius
    MarkAurelius over 8 years
    The function of the same name in the provided unit HTTPApp doesn't handle the &nbsp; token. This is based on that code, but does.
  • smooty86
    smooty86 about 4 years
    WARNING: this function leads to buffer overflow. If the string contains "&&&", it leads to "Exit" function, this skips SetLength and Result buffer length remains with incorrect data! You have been warned.
  • AmigoJack
    AmigoJack about 4 years
    There are many more entities that this function doesn't handle (e.g. &apos;, &bull;, &copy;...). Nor is a hexadecimal reference such as &#x5bce; supported.
  • Andreas Rejbrand
    Andreas Rejbrand almost 4 years
    There seems to be a non-terminated comment at the beginning of the routine.