Skip to content

Decode some html entities #835

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 9, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/constants/string_mappings.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
* Copyright 2012-2016, Plotly, Inc.
* All rights reserved.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/


'use strict';

// N.B. HTML entities are listed without the leading '&' and trailing ';'

module.exports = {

entityToUnicode: {
'mu': 'μ',
'amp': '&',
'lt': '<',
'gt': '>',
'quot': '\"',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

&quot; support makes me really uncomfortable since it's been part of so many of our XSS attacks (inserting a " sometimes allows an entity to be terminated early).

Is this definitely needed?

(The rest looks OK to me.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it isn't definitely needed. I thought it was common enough to be part of the list. I'll drop it.

'nbsp': ' ',
'times': '×',
'plusmn': '±',
'deg': '°'
},

// Characters that get escaped when encoding text for HTML, mapped to their
// entity names (again without the leading '&' and trailing ';').
//
// N.B. key order matters: svg_text_utils.js walks Object.keys() of this map
// and applies one global replace per key, in order. '&' must stay first,
// otherwise the '&' of entities produced by the later keys would itself be
// escaped again (e.g. '<' -> '&lt;' -> '&amp;lt;').
unicodeToEntity: {
'&': 'amp',
'<': 'lt',
'>': 'gt',
'"': 'quot',
'\'': '#x27',
'\/': '#x2F'
}

};
11 changes: 3 additions & 8 deletions src/lib/html2unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,7 @@
'use strict';

var toSuperScript = require('superscript-text');

var ENTITIES = {
'mu': 'μ',
'amp': '&',
'lt': '<',
'gt': '>'
};
var stringMappings = require('../constants/string_mappings');

function fixSuperScript(x) {
var idx = 0;
Expand All @@ -40,6 +34,7 @@ function stripTags(x) {
}

function fixEntities(x) {
var entityToUnicode = stringMappings.entityToUnicode;
var idx = 0;

while((idx = x.indexOf('&', idx)) >= 0) {
Expand All @@ -49,7 +44,7 @@ function fixEntities(x) {
continue;
}

var entity = ENTITIES[x.slice(idx + 1, nidx)];
var entity = entityToUnicode[x.slice(idx + 1, nidx)];
if(entity) {
x = x.slice(0, idx) + entity + x.slice(nidx + 1);
} else {
Expand Down
41 changes: 35 additions & 6 deletions src/lib/svg_text_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ var d3 = require('d3');

var Lib = require('../lib');
var xmlnsNamespaces = require('../constants/xmlns_namespaces');
var stringMappings = require('../constants/string_mappings');

// Append SVG

Expand Down Expand Up @@ -67,6 +68,7 @@ exports.convertToTspans = function(_context, _callback) {
var str = _context.text();
var converted = convertToSVG(str);
var that = _context;

// Until we get tex integrated more fully (so it can be used along with non-tex)
// allow some elements to prohibit it by attaching 'data-notex' to the original
var tex = (!that.attr('data-notex')) && converted.match(/([^$]*)([$]+[^$]*[$]+)([^$]*)/);
Expand Down Expand Up @@ -233,22 +235,48 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:'];

var STRIP_TAGS = new RegExp('</?(' + Object.keys(TAG_STYLES).join('|') + ')( [^>]*)?/?>', 'g');

/**
 * Ordered list of { regExp, sub } pairs used to decode the supported HTML
 * entities (stringMappings.entityToUnicode) into their unicode characters.
 *
 * The pairs are applied one after another rather than in a single pass, so
 * '&amp;' has to be decoded last: decoding it earlier turns '&amp;lt;' into
 * '&lt;', which a later pair would then decode a second time into '<'.
 */
var ENTITY_TO_UNICODE = (function() {
    var entityToUnicode = stringMappings.entityToUnicode;

    var names = Object.keys(entityToUnicode).filter(function(name) {
        return name !== 'amp';
    });

    // 'amp' is the only entity whose substitution ('&') can start a new
    // entity, so it goes to the end of the list
    if(Object.prototype.hasOwnProperty.call(entityToUnicode, 'amp')) {
        names.push('amp');
    }

    return names.map(function(name) {
        return {
            regExp: new RegExp('&' + name + ';', 'g'),
            sub: entityToUnicode[name]
        };
    });
})();

// Ordered { regExp, sub } pairs that turn each character listed in
// stringMappings.unicodeToEntity into its '&...;' entity form.
// The pairs keep the key order of the mapping object.
var UNICODE_TO_ENTITY = (function() {
    var pairs = [];

    Object.keys(stringMappings.unicodeToEntity).forEach(function(character) {
        pairs.push({
            regExp: new RegExp(character, 'g'),
            sub: '&' + stringMappings.unicodeToEntity[character] + ';'
        });
    });

    return pairs;
})();

exports.plainText = function(_str) {
// strip out our pseudo-html so we have a readable
// version to put into text fields
return (_str || '').replace(STRIP_TAGS, ' ');
};

/**
 * Run a string through a list of substitutions, in list order.
 *
 * @param {string} _str input text; falsy values are treated as ''
 * @param {Array} list items of the form { regExp: RegExp, sub: string }
 * @return {string} the text after every substitution has been applied,
 *   each one operating on the result of the previous one
 */
function replaceFromMapObject(_str, list) {
    return list.reduce(function(text, entry) {
        return text.replace(entry.regExp, entry.sub);
    }, _str || '');
}

/**
 * Decode the supported HTML entities in a string.
 *
 * @param {string} _str text that may contain entities such as '&mu;'
 * @return {string} the text after applying the ENTITY_TO_UNICODE substitutions
 */
function convertEntities(_str) {
    var decoded = replaceFromMapObject(_str, ENTITY_TO_UNICODE);

    return decoded;
}

/**
 * Escape a string for safe inclusion in HTML.
 *
 * The characters to escape and their entities come from
 * stringMappings.unicodeToEntity (via UNICODE_TO_ENTITY): & < > " ' and /.
 *
 * @param {string} _str raw text; falsy values are treated as ''
 * @return {string} the escaped text
 */
function encodeForHTML(_str) {
    return replaceFromMapObject(_str, UNICODE_TO_ENTITY);
}

function convertToSVG(_str) {
_str = convertEntities(_str);

var result = _str
.split(/(<[^<>]*>)/).map(function(d) {
var match = d.match(/<(\/?)([^ >]*)\s*(.*)>/i),
Expand All @@ -267,6 +295,7 @@ function convertToSVG(_str) {
* resurrect it.
*/
extraStyle = extra.match(/^style\s*=\s*"([^"]+)"\s*/i);

// anchor and br are the only ones that don't turn into a tspan
if(tag === 'a') {
if(close) return '</a>';
Expand Down