blob: 08521b1082fa3e8afd8afcd73a9884751022479c [file] [log] [blame]
// Copyright 2009 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview Utility functions for BiDi support. Adapted in part from
* <http://code.google.com/p/closure-library/source/browse/trunk/closure/goog/i18n/bidi.js>.
*/
goog.provide('bidichecker.utils');
goog.provide('bidichecker.utils.Substring');
goog.require('goog.array');
goog.require('goog.dom');
goog.require('goog.dom.TagIterator');
goog.require('goog.style');
/**
* A practical pattern to identify strong LTR characters. This pattern is not
* theoretically correct according to the Unicode standard. It is simplified for
* performance and small code size. Note: LRM is treated here as a strong LTR
* character; this is important for detecting spillover errors.
* @type {string}
* @private
*/
bidichecker.utils.ltrChars_ =
'A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF' +
'\u200E\u2C00-\uFB1C\uFE00-\uFE6F\uFEFD-\uFFFF';
/**
* A practical pattern to identify neutral and weakly directional characters,
* *not* including whitespace. This pattern is not theoretically correct
* according to the Unicode standard. It is simplified for performance and small
* code size.
* @type {string}
* @private
*/
bidichecker.utils.nonWhiteNeutralChars_ =
'\u0000-\u0008\u000e-\u001f!-@[-`{-\u0084\u0086-\u009f\u00a1-\u00BF' +
'\u00D7\u00F7\u02B9-\u02FF\u200b-\u200d\u2010-\u2027\u2030-\u205e' +
'\u2060-\u2BFF';
/**
* A practical pattern to identify neutral and weakly directional characters,
* including whitespace. This pattern is not theoretically correct according to
* the Unicode standard. It is simplified for performance and small code size.
* @type {string}
* @private
*/
bidichecker.utils.neutralChars_ = bidichecker.utils.nonWhiteNeutralChars_ +
'\u0009-\u000d\u0020\u0085\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029' +
'\u202f\u205f\u3000';
/**
* A practical pattern to identify strong RTL characters. This pattern is not
* theoretically correct according to the Unicode standard. It is simplified
* for performance and small code size. Note: RLM is treated here as a strong
* RTL character; this is important for detecting spillover errors.
* @type {string}
* @private
*/
bidichecker.utils.rtlChars_ = '\u0591-\u07FF\u200F\uFB1D-\uFDFF\uFE70-\uFEFC';
/**
* Nonprinting characters, including whitespace except the standard space.
* Used in escaping text for output.
* @type {string}
* @private
*/
bidichecker.utils.nonprintingChars_ = '\u0000-\u001f\u007f\u0085\u00a0\u1680' +
'\u180e\u2000-\u200f\u2028\u2029\u202f\u205f\u3000';
/**
* Regular expression to check for a substring of RTL characters with neutral
* characters among them (excluding Unicode bidi control characters).
* @type {RegExp}
* @private
*/
bidichecker.utils.rtlSubstringReg_ =
// Pattern starts and ends with an RTL character, with a sequence of
// RTL or neutral characters between them. The sequence following the
// first character is optional to allow for a match against a single RTL
// character.
new RegExp('[' + bidichecker.utils.rtlChars_ + '](?:[^' +
bidichecker.utils.ltrChars_ + '\u202A-\u202E]*[' +
bidichecker.utils.rtlChars_ + '])?', 'g');
/**
* Regular expression to check for a substring of LTR characters with neutral
* characters among them (excluding Unicode bidi control characters).
* @type {RegExp}
* @private
*/
bidichecker.utils.ltrSubstringReg_ =
// Pattern starts and ends with an LTR character, with a sequence of
// LTR or neutral characters between them. The sequence following the
// first character is optional to allow for a match against a single LTR
// character.
new RegExp('[' + bidichecker.utils.ltrChars_ + '](?:[^' +
bidichecker.utils.rtlChars_ + '\u202A-\u202E]*[' +
bidichecker.utils.ltrChars_ + '])?', 'g');
/**
* Regular expression to check for a prefix substring of neutral characters,
* possibly including whitespace. Used to find neutrals following opposite-
* directionality text.
* @type {RegExp}
* @private
*/
bidichecker.utils.initialNeutralSubstringReg_ =
new RegExp('^[' + bidichecker.utils.neutralChars_ + ']*[' +
bidichecker.utils.nonWhiteNeutralChars_ + ']');
/**
* Regular expression to check for a suffix substring of neutral characters,
* possibly including whitespace. Used to find neutrals preceding opposite-
* directionality text.
* @type {RegExp}
* @private
*/
bidichecker.utils.finalNeutralSubstringReg_ =
new RegExp('[' + bidichecker.utils.nonWhiteNeutralChars_ + '][' +
bidichecker.utils.neutralChars_ + ']*$');
/**
* Regular expression to match nonprinting chararacters.
* @type {RegExp}
* @private
*/
bidichecker.utils.nonprintingCharsReg_ =
new RegExp('[' + bidichecker.utils.nonprintingChars_ + ']');
/**
* Regular expression to check whether a string consists entirely of LRM
* marks and/or neutral characters.
* @type {RegExp}
* @private
*/
bidichecker.utils.lrmOrNeutralReg_ =
new RegExp('^[' + bidichecker.utils.neutralChars_ + '\u200E]+$');
/**
* Regular expression to check whether a string consists entirely of RLM
* marks and/or neutral characters.
* @type {RegExp}
* @private
*/
bidichecker.utils.rlmOrNeutralReg_ =
new RegExp('^[' + bidichecker.utils.neutralChars_ + '\u200F]+$');
/**
* Regular expression to check for a prefix substring of numbers, possibly mixed
* with neutral characters or whitespace. Used to identify spillover errors.
* @type {RegExp}
* @private
*/
bidichecker.utils.initialNumberSubstringReg_ =
new RegExp('^[' + bidichecker.utils.neutralChars_ + ']*\\d');
/**
* Regular expression to check for any strongly-directional character.
* @type {RegExp}
* @private
*/
bidichecker.utils.directionalCharReg_ =
new RegExp('[' + bidichecker.utils.ltrChars_ + bidichecker.utils.rtlChars_ +
']');
/**
* Class representing a substring and its index within some other string.
* @param {string} text The contents of the substring.
* @param {number} index Its 0-based string index within the original string.
* @constructor
*/
bidichecker.utils.Substring = function(text, index) {
/**
* The textual contents of the substring.
* @type {string}
*/
this.text = text;
/**
* The 0-based index of the substring's location in the original string.
* @type {number}
*/
this.index = index;
};
/**
* Finds substrings in a given string of RTL characters with neutral characters
* among them.
* @param {string} str The string to be searched.
* @return {Array.<bidichecker.utils.Substring>} Array of matching substrings.
* Returns empty array if no match.
*/
bidichecker.utils.findRtlSubstrings = function(str) {
var results = [];
var match;
while ((match = bidichecker.utils.rtlSubstringReg_.exec(str))) {
results.push(new bidichecker.utils.Substring(match[0], match.index));
}
return results;
};
/**
* Finds substrings in a given string of LTR characters with neutral characters
* among them. Ignores "fake RTL" strings, when the LTR characters are
* sandwiched between Unicode RLO and PDF characters.
* @param {string} str The string to be searched.
* @return {Array.<bidichecker.utils.Substring>} Array of matching substrings.
* Returns empty array if no match.
*/
bidichecker.utils.findLtrSubstrings = function(str) {
var results = [];
var match;
while ((match = bidichecker.utils.ltrSubstringReg_.exec(str))) {
// Skip fake RTl strings.
if (str.charAt(match.index - 1) == '\u202E' &&
str.charAt(match.index + match[0].length) == '\u202C') {
continue;
}
results.push(new bidichecker.utils.Substring(match[0], match.index));
}
return results;
};
/**
* Checks whether a string contains only LRM characters and/or neutrals.
* @param {string} str The string to be searched.
* @return {boolean} Does it have only LRMs and neutrals?
*/
bidichecker.utils.hasOnlyLrmChars = function(str) {
return bidichecker.utils.lrmOrNeutralReg_.exec(str) ? true : false;
};
/**
* Checks whether a string contains only RLM characters and/or neutrals.
* @param {string} str The string to be searched.
* @return {boolean} Does it have only RLMs and neutrals?
*/
bidichecker.utils.hasOnlyRlmChars = function(str) {
return bidichecker.utils.rlmOrNeutralReg_.exec(str) ? true : false;
};
/**
* Finds a prefixed substring of neutral characters starting at a given index in
* a string.
* @param {string} str The string to be searched.
* @param {number} index The index from which to begin searching.
* @return {bidichecker.utils.Substring} Matching substring or null if no match.
*/
bidichecker.utils.findNeutralTextAtIndex = function(str, index) {
var match = bidichecker.utils.initialNeutralSubstringReg_.exec(
str.substr(index));
if (match) {
return new bidichecker.utils.Substring(match[0], index);
}
return null;
};
/**
* Finds a substring of neutral characters ending before a given index in
* a string.
* @param {string} str The string to be searched.
* @param {number} index The index before which to begin searching.
* @return {bidichecker.utils.Substring} Matching substring or null if no match.
*/
bidichecker.utils.findNeutralTextBeforeIndex = function(str, index) {
var match = bidichecker.utils.finalNeutralSubstringReg_.exec(
str.substr(0, index));
if (match) {
return new bidichecker.utils.Substring(match[0], index - match[0].length);
}
return null;
};
/**
* Finds a prefixed substring of numbers, possibly mixed with neutral characters
* or whitespace, starting at the beginning of a string.
* @param {string} str The string to be searched.
* @return {bidichecker.utils.Substring} Matching substring or null if no match.
*/
bidichecker.utils.findNumberAtStart = function(str) {
var match = bidichecker.utils.initialNumberSubstringReg_.exec(str);
if (match) {
return new bidichecker.utils.Substring(match[0], 0);
}
return null;
};
/**
* Checks a string for any strongly-directional character (LTR or RTL).
* @param {string} str The string to be searched.
* @return {boolean} Whether a strongly-directional character is present.
*/
bidichecker.utils.hasDirectionalCharacter = function(str) {
return bidichecker.utils.directionalCharReg_.test(str);
};
/**
* Maps raw characters to their escaped versions, preseeded with the special
* cases. Used by bidichecker.utils.escapeChar.
* @private
* @type {Object}
*/
bidichecker.utils.escapedChars_ = {
'\b': '\\b',
'\f': '\\f',
'\n': '\\n',
'\r': '\\r',
'\t': '\\t',
'"': '\\"',
'\'': '\\\'',
'\\': '\\\\'
};
/**
* Takes a string and returns the equivalent escaped string.
* @param {string} str The string to escape.
* @return {string} An escaped string representing {@code str}.
*/
bidichecker.utils.escapeString = function(str) {
var sb = [];
for (var i = 0; i < str.length; i++) {
sb[i] = bidichecker.utils.escapeChar(str.charAt(i));
}
return sb.join('');
};
/**
* Takes a character and returns the escaped string for that character. For
* example escapeChar(String.fromCharCode(15)) -> "\\u000e".
* @param {string} c The character to escape.
* @return {string} An escaped string representing {@code c}.
*/
bidichecker.utils.escapeChar = function(c) {
if (c in bidichecker.utils.escapedChars_) {
return bidichecker.utils.escapedChars_[c];
}
var rv = c;
if (bidichecker.utils.nonprintingCharsReg_.test(c)) {
var cc = c.charCodeAt(0);
rv = '\\u';
if (cc < 4096) { // \u1000
rv += '0';
if (cc < 256) {
rv += '0';
if (cc < 16) {
rv += '0';
}
}
}
rv += cc.toString(16);
}
// Add it to the map for next time.
return bidichecker.utils.escapedChars_[c] = rv;
};
/**
* Surround a string with quotes, escaping the contents.
* @param {string} str The string to be quoted.
* @return {string} The original string surrounded by single quotes, and with
* any quote marks and unprintable characters escaped as hex codes.
*/
bidichecker.utils.singleQuoteString = function(str) {
return "'" + bidichecker.utils.escapeString(str) + "'";
};
/**
* Stringifies an attribute node as xyz='abc', where xyz is the attribute name
* and abc is its value. Escapes any special characters in the attribute value.
* @param {Node} node An attribute node.
* @return {string} The string description of the attribute.
* @private
*/
bidichecker.utils.describeAttribute_ = function(node) {
return node.nodeName + '=' +
bidichecker.utils.singleQuoteString(node.nodeValue);
};
/**
* Describes an element node as <div qrs='tuv' xyz='abc'>, where div is the
* node name and the remain fields are its attributes and their values. Escapes
* any special characters in the attribute values.
* @param {Node} node The node to describe.
* @return {string} The string form of the attribute.
*/
bidichecker.utils.describeNode = function(node) {
var attributes = [];
if (node.attributes) {
for (var i = 0; i < node.attributes.length; ++i) {
var attribute = node.attributes[i];
// In IE, node.attributes includes implicitly defined attributes.
if (attribute.specified === undefined || attribute.specified) {
attributes.push(attribute);
}
}
}
var descriptionFields = [node.nodeName.toLowerCase()];
var attributeDescriptions = goog.array.map(
attributes, bidichecker.utils.describeAttribute_);
attributeDescriptions.sort(); // Canonicalize order of attributes.
descriptionFields = descriptionFields.concat(attributeDescriptions);
return '<' + descriptionFields.join(' ') + '>';
};
/**
* Describes the location in the DOM of an element node. Walks up to the
* lowest-level parent node with an id attribute, or else continues all the way
* up the hierarchy.
* Output is a string of the form <div id='xyz'><p dir='rtl'>.
* @param {Node} node The node to describe.
* @return {string} The string description.
*/
bidichecker.utils.describeLocation = function(node) {
var descriptions = [];
for (var current = node; current; current = current.parentNode) {
descriptions.push(bidichecker.utils.describeNode(current));
// Found a uniquely identifiable element; stop here.
if (current.id || current.nodeName == 'BODY') {
break;
}
}
descriptions.reverse();
return descriptions.join('');
};
/**
* Collects all the text in the subtree below the given element in the DOM.
* @param {Node} node The node at the root of the subtree.
* @return {string} The text contents of the node.
*/
bidichecker.utils.getTextContents = function(node) {
// TODO(user): Translate non-text elements such as <IMG>, <BR>, etc., to
// some sort of visible text, such as a smiley for an image and a newline for
// BR.
var textPieces = [];
var iter = new goog.dom.TagIterator(node);
goog.iter.forEach(iter, function() {
if (iter.node.nodeType == goog.dom.NodeType.TEXT) {
textPieces.push(iter.node.data);
}
});
return textPieces.join('');
};
/**
* Names of all block-level tags.
* @type {Object}
* @private
*/
bidichecker.utils.BLOCK_TAG_NAMES_ = goog.object.createSet(
'ADDRESS', 'BLOCKQUOTE', 'BODY', 'CAPTION', 'CENTER', 'COL', 'COLGROUP',
'DIR', 'DIV', 'DL', 'DD', 'DT', 'FIELDSET', 'FORM', 'H1', 'H2', 'H3', 'H4',
'H5', 'H6', 'HR', 'ISINDEX', 'OL', 'LI', 'MAP', 'MENU', 'OPTGROUP',
'OPTION', 'P', 'PRE', 'TABLE', 'TBODY', 'TD', 'TFOOT', 'TH', 'THEAD', 'TR',
'TL', 'UL');
/**
* Checks if element is a block-level html element. The <tt>display</tt> css
* property is ignored.
* @param {Element} element The element to test.
* @return {boolean} Whether the element has a block-level tag.
* @private
*/
bidichecker.utils.isBlockTag_ = function(element) {
return !!bidichecker.utils.BLOCK_TAG_NAMES_[element.tagName];
};
/**
* Names of all CSS 'display' styles which are inlined.
* @type {Object}
* @private
*/
bidichecker.utils.INLINE_DISPLAY_TYPES_ = goog.object.createSet(
'inline', 'inline-block', 'inline-table');
/**
* The result of the most recent call to {@code
* bidichecker.utils.getDisplayStyle()}, cached to economize on calls to
* {@code goog.style.get*Style()}. It's commonly called more than once in
* succession for the same element.
* @type {Object}
* @private
*/
bidichecker.utils.lastDisplayStyle_ = { element: null, style: null };
/**
* Returns the display style of an element.
* @param {Element} element The element to test.
* @return {?string} The element's display style, if available.
*/
bidichecker.utils.getDisplayStyle = function(element) {
if (bidichecker.utils.lastDisplayStyle_.element == element) {
return bidichecker.utils.lastDisplayStyle_.style;
}
var style = goog.style.getCascadedStyle(element, 'display') ||
goog.style.getComputedStyle(element, 'display');
bidichecker.utils.lastDisplayStyle_ = { element: element, style: style };
return style;
};
/**
* Checks if an element is actually displayed as a block-level html element,
* considering both the default tag type and the <tt>display</tt> css property.
* @param {Element} element The element to test.
* @return {boolean} Whether the element is displayed as a block-level element.
*/
bidichecker.utils.isBlockElement = function(element) {
var display = bidichecker.utils.getDisplayStyle(element);
if (display) {
return !bidichecker.utils.INLINE_DISPLAY_TYPES_[display];
} else {
return bidichecker.utils.isBlockTag_(element);
}
};
/**
* Checks if a node is displayable; that is, it does not have style
* "display:none" and is not a script, noscript or style element.
* <p>In some browsers (such as IE), contents of noscript tags are not
* accessible anyway with scripting activated. In others (such as Chrome) the
* DOM contains a noscript element whose contents are the entity-encoded HTML
* source code of the contents of the noscript tag; if the context is RTL, that
* produces undeclared-directionality errors.
* @param {Node} node The node to test.
* @return {boolean} Whether the node is displayable.
*/
bidichecker.utils.isDisplayable = function(node) {
if (node.nodeType == goog.dom.NodeType.ELEMENT) {
if (node.nodeName == 'SCRIPT' || node.nodeName == 'STYLE' ||
node.nodeName == 'NOSCRIPT') {
return false;
}
return bidichecker.utils.getDisplayStyle(
/** @type {Element} */ (node)) != 'none';
}
return true;
};