| Module | HTMLEntities |
| In: |
lib/htmlentities/htmlentities.rb
|
HTML entity encoding and decoding for Ruby
| Author: | Paul BATTLEY (pbattley @ gmail.com) |
| Version: | 2.2 |
| Date: | 2005-11-07 |
This library extends the String class to allow encoding and decoding of HTML/XML entities to and from their corresponding Unicode codepoints, represented as UTF-8.
Copyright © 2005 Paul Battley
Usage of the works is permitted provided that this instrument is retained with the works, so that any entity that uses the works is notified of this instrument.
DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
| VERSION | = | '2.2' | ||
| MAP | = | { 'quot' => 34, 'apos' => 39, 'amp' => 38, 'lt' => 60, 'gt' => 62, 'nbsp' => 160, 'iexcl' => 161, 'curren' => 164, 'cent' => 162, 'pound' => 163, 'yen' => 165, 'brvbar' => 166, 'sect' => 167, 'uml' => 168, 'copy' => 169, 'ordf' => 170, 'laquo' => 171, 'not' => 172, 'shy' => 173, 'reg' => 174, 'trade' => 8482, 'macr' => 175, 'deg' => 176, 'plusmn' => 177, 'sup2' => 178, 'sup3' => 179, 'acute' => 180, 'micro' => 181, 'para' => 182, 'middot' => 183, 'cedil' => 184, 'sup1' => 185, 'ordm' => 186, 'raquo' => 187, 'frac14' => 188, 'frac12' => 189, 'frac34' => 190, 'iquest' => 191, 'times' => 215, 'divide' => 247, 'Agrave' => 192, 'Aacute' => 193, 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196, 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199, 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202, 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205, 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208, 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211, 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214, 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218, 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221, 'THORN' => 222, 'szlig' => 223, 'agrave' => 224, 'aacute' => 225, 'acirc' => 226, 'atilde' => 227, 'auml' => 228, 'aring' => 229, 'aelig' => 230, 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233, 'ecirc' => 234, 'euml' => 235, 'igrave' => 236, 'iacute' => 237, 'icirc' => 238, 'iuml' => 239, 'eth' => 240, 'ntilde' => 241, 'ograve' => 242, 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245, 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249, 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252, 'yacute' => 253, 'thorn' => 254, 'yuml' => 255, 'OElig' => 338, 'oelig' => 339, 'Scaron' => 352, 'scaron' => 353, 'Yuml' => 376, 'circ' => 710, 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195, 'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205, 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211, 'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217, 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221, 'bdquo' => 8222, 
'dagger' => 8224, 'Dagger' => 8225, 'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249, 'rsaquo' => 8250, 'euro' => 8364 } | MAP is a hash of all the HTML entities I could discover, as taken from the w3schools page on the subject: www.w3schools.com/html/html_entitiesref.asp. The format is ‘entity name’ => codepoint, where the entity name is given without the surrounding ampersand and semicolon. | |
| MIN_LENGTH | = | MAP.keys.map{ |a| a.length }.min | ||
| MAX_LENGTH | = | MAP.keys.map{ |a| a.length }.max | ||
| NAMED_ENTITY_REGEXP | = | /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i | Precompile the regexp | |
| REVERSE_MAP | = | MAP.invert | Reverse map for converting characters to named entities | |
| BASIC_ENTITY_REGEXP | = | /[<>'"&]/ | ||
| UTF8_NON_ASCII_REGEXP | = | /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/ |