Skip to content

Commit 3860295

Browse files
committed
improve pseudo-html entity conversion
- move it later in the convertToTspans pipeline
- support all numbered entities (including the correct ampersand code!)
1 parent a8c6217 commit 3860295

File tree

4 files changed

+115
-79
lines changed

4 files changed

+115
-79
lines changed

src/constants/string_mappings.js

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,21 @@
1212
// N.B. HTML entities are listed without the leading '&' and trailing ';'
1313
// https://www.freeformatter.com/html-entities.html
1414

15+
// FWIW if we wanted to support the full set, it has 2261 entries:
16+
// https://www.w3.org/TR/html5/entities.json
17+
// though I notice that some of these are duplicates and/or are missing ";"
18+
// eg: "&amp;", "&amp", "&AMP;", and "&AMP" all map to "&"
19+
// We no longer need to include numeric entities here; these are now handled
20+
// by String.fromCodePoint/fromCharCode in svg_text_utils
1521
module.exports = {
1622
entityToUnicode: {
17-
'mu': 'μ',
18-
'#956': 'μ',
19-
20-
'amp': '&',
21-
'#28': '&',
22-
23-
'lt': '<',
24-
'#60': '<',
25-
26-
'gt': '>',
27-
'#62': '>',
28-
29-
'nbsp': ' ',
30-
'#160': ' ',
31-
32-
'times': '×',
33-
'#215': '×',
34-
35-
'plusmn': '±',
36-
'#177': '±',
37-
38-
'deg': '°',
39-
'#176': '°'
23+
mu: 'μ',
24+
amp: '&',
25+
lt: '<',
26+
gt: '>',
27+
nbsp: ' ',
28+
times: '×',
29+
plusmn: '±',
30+
deg: '°'
4031
}
4132
};

src/lib/html2unicode.js

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
'use strict';
1111

1212
var toSuperScript = require('superscript-text');
13-
var stringMappings = require('../constants/string_mappings');
13+
var fixEntities = require('./svg_text_utils').convertEntities;
1414

1515
function fixSuperScript(x) {
1616
var idx = 0;
@@ -33,28 +33,6 @@ function stripTags(x) {
3333
return x.replace(/\<.*\>/g, '');
3434
}
3535

36-
function fixEntities(x) {
37-
var entityToUnicode = stringMappings.entityToUnicode;
38-
var idx = 0;
39-
40-
while((idx = x.indexOf('&', idx)) >= 0) {
41-
var nidx = x.indexOf(';', idx);
42-
if(nidx < idx) {
43-
idx += 1;
44-
continue;
45-
}
46-
47-
var entity = entityToUnicode[x.slice(idx + 1, nidx)];
48-
if(entity) {
49-
x = x.slice(0, idx) + entity + x.slice(nidx + 1);
50-
} else {
51-
x = x.slice(0, idx) + x.slice(nidx + 1);
52-
}
53-
}
54-
55-
return x;
56-
}
57-
5836
function convertHTMLToUnicode(html) {
5937
return '' +
6038
fixEntities(

src/lib/svg_text_utils.js

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ var d3 = require('d3');
1515

1616
var Lib = require('../lib');
1717
var xmlnsNamespaces = require('../constants/xmlns_namespaces');
18-
var stringMappings = require('../constants/string_mappings');
18+
var entityToUnicode = require('../constants/string_mappings').entityToUnicode;
1919
var LINE_SPACING = require('../constants/alignment').LINE_SPACING;
2020

2121
// text converter
@@ -223,13 +223,6 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:', '', undefined, ':'];
223223

224224
var STRIP_TAGS = new RegExp('</?(' + Object.keys(TAG_STYLES).join('|') + ')( [^>]*)?/?>', 'g');
225225

226-
var ENTITY_TO_UNICODE = Object.keys(stringMappings.entityToUnicode).map(function(k) {
227-
return {
228-
regExp: new RegExp('&' + k + ';', 'g'),
229-
sub: stringMappings.entityToUnicode[k]
230-
};
231-
});
232-
233226
var NEWLINES = /(\r\n?|\n)/g;
234227

235228
var SPLIT_TAGS = /(<[^<>]*>)/;
@@ -254,6 +247,14 @@ var BR_TAG = /<br(\s+.*)?>/i;
254247
*
255248
* Because we hack in other attributes with style (sub & sup), drop any trailing
256249
* semicolon in user-supplied styles so we can consistently append the tag-dependent style
250+
*
251+
* These are for tag attributes; Chrome anyway will convert entities in
252+
* attribute values, but not in attribute names
253+
* you can test this by for example:
254+
* > p = document.createElement('p')
255+
* > p.innerHTML = '<span styl&#x65;="font-color:r&#x65;d;">Hi</span>'
256+
* > p.innerHTML
257+
* <- '<span styl&#x65;="font-color:red;">Hi</span>'
257258
*/
258259
var STYLEMATCH = /(^|[\s"'])style\s*=\s*("([^"]*);?"|'([^']*);?')/i;
259260
var HREFMATCH = /(^|[\s"'])href\s*=\s*("([^"]*)"|'([^']*)')/i;
@@ -265,7 +266,8 @@ var POPUPMATCH = /(^|[\s"'])popup\s*=\s*("([\w=,]*)"|'([\w=,]*)')/i;
265266
function getQuotedMatch(_str, re) {
266267
if(!_str) return null;
267268
var match = _str.match(re);
268-
return match && (match[3] || match[4]);
269+
var result = match && (match[3] || match[4]);
270+
return result && convertEntities(result);
269271
}
270272

271273
var COLORMATCH = /(^|;)\s*color:/;
@@ -276,19 +278,45 @@ exports.plainText = function(_str) {
276278
return (_str || '').replace(STRIP_TAGS, ' ');
277279
};
278280

279-
function replaceFromMapObject(_str, list) {
280-
if(!_str) return '';
281-
282-
for(var i = 0; i < list.length; i++) {
283-
var item = list[i];
284-
_str = _str.replace(item.regExp, item.sub);
285-
}
281+
// NOTE: in general entities can contain uppercase and digits too (so [a-zA-Z\d]) but all the
282+
// ones we support use only lowercase. If we ever change that, update the regex.
283+
var ENTITY_MATCH = /&(#\d+|#x[\da-fA-F]+|[a-z]+);/g;
284+
function convertEntities(_str) {
285+
return _str.replace(ENTITY_MATCH, function(fullMatch, innerMatch) {
286+
var outChar;
287+
if(innerMatch.charAt(0) === '#') {
288+
// cannot use String.fromCodePoint in IE
289+
outChar = fromCodePoint(
290+
innerMatch.charAt(1) === 'x' ?
291+
parseInt(innerMatch.substr(2), 16) :
292+
parseInt(innerMatch.substr(1), 10)
293+
);
294+
}
295+
else outChar = entityToUnicode[innerMatch];
286296

287-
return _str;
297+
// as in regular HTML, if we didn't decode the entity just
298+
// leave the raw text in place.
299+
return outChar || fullMatch;
300+
});
288301
}
289-
290-
function convertEntities(_str) {
291-
return replaceFromMapObject(_str, ENTITY_TO_UNICODE);
302+
exports.convertEntities = convertEntities;
303+
304+
// IE lacks String.fromCodePoint; and even in other browsers we don't want to overflow
305+
var stringFromCodePoint = String.fromCodePoint;
306+
var stringFromCharCode = String.fromCharCode;
307+
function fromCodePoint(code) {
308+
// Don't allow overflow. In Chrome this turns into � but I feel like it's
309+
// more useful to just not convert it at all.
310+
if(code > 0x10FFFF) return;
311+
if(stringFromCodePoint) return stringFromCodePoint(code);
312+
313+
// IE doesn't have String.fromCodePoint
314+
// see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint
315+
if(code <= 0xFFFF) return stringFromCharCode(code);
316+
return stringFromCharCode(
317+
(code >> 10) + 0xD7C0,
318+
(code % 0x400) + 0xDC00
319+
);
292320
}
293321

294322
/*
@@ -302,15 +330,14 @@ function convertEntities(_str) {
302330
* somewhat differently if it does, so just keep track of this when it happens.
303331
*/
304332
function buildSVGText(containerNode, str) {
305-
str = convertEntities(str)
306-
/*
307-
* Normalize behavior between IE and others wrt newlines and whitespace:pre
308-
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
309-
* Chrome and FF display \n, \r, or \r\n as a space in this mode.
310-
* I feel like at some point we turned these into <br> but currently we don't so
311-
* I'm just going to cement what we do now in Chrome and FF
312-
*/
313-
.replace(NEWLINES, ' ');
333+
/*
334+
* Normalize behavior between IE and others wrt newlines and whitespace:pre
335+
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
336+
* Chrome and FF display \n, \r, or \r\n as a space in this mode.
337+
* I feel like at some point we turned these into <br> but currently we don't so
338+
* I'm just going to cement what we do now in Chrome and FF
339+
*/
340+
str = str.replace(NEWLINES, ' ');
314341

315342
var hasLink = false;
316343

@@ -435,7 +462,7 @@ function buildSVGText(containerNode, str) {
435462
newLine();
436463
}
437464
else if(tagStyle === undefined) {
438-
addTextNode(currentNode, parti);
465+
addTextNode(currentNode, convertEntities(parti));
439466
}
440467
else {
441468
// tag - open or close

test/jasmine/tests/svg_text_utils_test.js

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,16 +300,56 @@ describe('svg+text utils', function() {
300300
'100 &times; 20 &plusmn; 0.5 &deg;'
301301
);
302302

303-
expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
303+
expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
304304
});
305305

306306
it('decodes some HTML entities in text (number case)', function() {
307307
var node = mockTextSVGElement(
308-
'100&#956; &#28; &#60; 10 &#62; 0 &#160;' +
308+
'100&#956; &#38; &#60; 10 &#62; 0 &#160;' +
309309
'100 &#215; 20 &#177; 0.5 &#176;'
310310
);
311311

312-
expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
312+
expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
313+
});
314+
315+
it('decodes arbitrary decimal and hex number entities', function() {
316+
var i = 0;
317+
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
318+
var node = mockTextSVGElement(
319+
'&#x' + n.toString(16) +
320+
'; = &#' + n.toString() +
321+
'; = &#x' + n.toString(16).toUpperCase() + ';'
322+
);
323+
var char = String.fromCodePoint(n);
324+
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
325+
i++;
326+
}
327+
// not really necessary to assert this, but we tested 355 characters,
328+
// weighted toward the low end but continuing all the way to the
329+
// end of the unicode definition
330+
expect(i).toBe(355);
331+
});
332+
333+
it('does not decode entities prematurely', function() {
334+
var testCases = [
335+
'&lt;b>not bold</b&gt;',
336+
'<b&gt;not bold</b&gt;',
337+
'&lt;b>not bold&lt;/b>',
338+
'<b&gt;not bold&lt;/b>',
339+
'&lt;b&gt;not bold&lt;/b&gt;'
340+
];
341+
testCases.forEach(function(testCase) {
342+
var node = mockTextSVGElement(testCase);
343+
344+
expect(node.html()).toBe(
345+
'&lt;b&gt;not bold&lt;/b&gt;', testCase
346+
);
347+
});
348+
349+
var controlNode = mockTextSVGElement('<b>bold</b>');
350+
expect(controlNode.html()).toBe(
351+
'<tspan style="font-weight:bold">bold</tspan>'
352+
);
313353
});
314354

315355
it('supports superscript by itself', function() {

0 commit comments

Comments
 (0)