| Class | String |
| In: |
lib/htmlentities/htmlentities.rb
|
| Parent: | Object |
| ENCODE_ENTITIES_COMMAND_ORDER | = | { :basic => 0, :named => 1, :decimal => 2, :hexadecimal => 3 } | Because there’s no need to make the user worry about the order here, let’s handle it. |
Decode XML and HTML 4.01 entities in a string into their UTF-8 equivalents. Obviously, if your string is not already in UTF-8, you’d better convert it before using this method, or the output will be mixed up. Unknown named entities are not converted.
# File lib/htmlentities/htmlentities.rb, line 203
# Decode XML and HTML 4.01 entities in the receiver into their UTF-8
# equivalents and return the result as a new string.
#
# Three kinds of reference are recognised:
# * named entities matched by HTMLEntities::NAMED_ENTITY_REGEXP, looked up
#   in HTMLEntities::MAP; names missing from the map are left untouched
# * decimal references of 1 to 7 digits (&#233;)
# * hexadecimal references of 1 to 6 digits (&#xE9;), case-insensitive
#
# All references are replaced in a single pass, so text produced by one
# replacement is never decoded a second time ("&amp;#38;" becomes "&#38;",
# not "&").  Numeric references that do not denote a Unicode scalar value
# (above U+10FFFF, or in the surrogate range U+D800..U+DFFF) are left
# untouched rather than turned into invalid UTF-8.
#
# The receiver is expected to be UTF-8 already.
def decode_entities
  numeric_reference = /&#(?:x[0-9a-f]{1,6}|[0-9]{1,7});/i
  # The named pattern comes first so that its capture group stays group 1
  # of the combined expression.
  gsub(Regexp.union(HTMLEntities::NAMED_ENTITY_REGEXP, numeric_reference)) do
    match = $~
    entity = match[0]
    name = match[1]
    if name
      HTMLEntities::MAP.has_key?(name) ? [HTMLEntities::MAP[name]].pack('U') : entity
    elsif (reference = /\A&#(x?)([0-9a-f]+);\z/i.match(entity))
      codepoint = reference[2].to_i(reference[1].empty? ? 10 : 16)
      scalar = codepoint <= 0x10FFFF && !(0xD800..0xDFFF).include?(codepoint)
      scalar ? [codepoint].pack('U') : entity
    else
      entity
    end
  end
end
Encode codepoints into their corresponding entities. Various operations are possible, and may be specified in order:
| :basic : | Convert the five XML entities ('"<>&) |
| :named : | Convert non-ASCII characters to their named HTML 4.01 equivalent |
| :decimal : | Convert non-ASCII characters to decimal entities (e.g. &#1234;) |
| :hexadecimal : | Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;) |
You can specify the commands in any order, but they will be executed in the order listed above to ensure that entity ampersands are not clobbered and that named entities are replaced before numeric ones.
If no instructions are specified, :basic will be used.
Examples:
str.encode_entities - XML-safe
str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all non-ASCII characters replaced with their named entity where possible, and decimal equivalents otherwise.
Note: It is the program’s responsibility to ensure that the string contains valid UTF-8 before calling this method.
# File lib/htmlentities/htmlentities.rb, line 238
# Encode characters of the receiver as entities and return the result as a
# new string.
#
# +instructions+ is any combination of the keys of
# ENCODE_ENTITIES_COMMAND_ORDER:
# * :basic       - replace characters matched by
#                  HTMLEntities::BASIC_ENTITY_REGEXP with their named entity
# * :named       - replace non-ASCII characters that have an entry in
#                  HTMLEntities::REVERSE_MAP with that named entity
# * :decimal     - replace non-ASCII characters with decimal references
# * :hexadecimal - replace non-ASCII characters with hexadecimal references
#
# Commands may be given in any order; they are always executed in the order
# defined by ENCODE_ENTITIES_COMMAND_ORDER, and a command given more than
# once is executed only once (running :basic twice would escape the
# ampersands it had just produced).  With no arguments, :basic is used.
#
# Raises RuntimeError when an unknown command is given.
#
# The receiver is expected to contain valid UTF-8.
def encode_entities(*instructions)
  if instructions.empty?
    instructions = [:basic]
  else
    instructions.each do |instr|
      unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
        raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
      end
    end
    instructions = instructions.uniq.sort_by { |instr| ENCODE_ENTITIES_COMMAND_ORDER[instr] }
  end

  instructions.inject(self) do |str, instruction|
    case instruction
    when :basic
      str.gsub(HTMLEntities::BASIC_ENTITY_REGEXP) do
        # Look the character up by code point.  Indexing the match with [0]
        # yields a String on Ruby 1.9+, which never matches the map's keys.
        '&' + HTMLEntities::REVERSE_MAP[$&.unpack('U')[0]] + ';'
      end
    when :named
      # Characters without a named entity are left for a later command.
      str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) do
        name = HTMLEntities::REVERSE_MAP[$&.unpack('U')[0]]
        name ? "&#{name};" : $&
      end
    when :decimal
      str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) { "&##{$&.unpack('U')[0]};" }
    when :hexadecimal
      str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) { "&#x#{$&.unpack('U')[0].to_s(16)};" }
    else
      str
    end
  end
end