blob: 960e3321947d74a94d7e7ab6b511a2e5b7c14e2c [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package org.chromium.distiller;
public class StringUtil {
// For the whitespace-related functions below, Java's and JavaScript's versions of '\s' and '\S'
// are different. E.g. Java doesn't recognize a non-breaking space (&nbsp;) in a text node as
// whitespace, but JavaScript does. The former causes GWT tests to fail; the latter is what we
// want.
// Don't use the "g" global search flag in these regexes; otherwise subsequent searches, even
// with a different Character or String, become unpredictable.
public static native boolean isWhitespace(Character c) /*-{
return /\s/.test(c);
public static native boolean isStringAllWhitespace(String s) /*-{
return !/\S/.test(s);
public static native String jsTrim(String s) /*-{
return s.trim();
public static String[] split(String input, String regex) {
// TODO(cjhopman): investigate using native String.split()
return input.split(regex);
public static native String[] jsSplit(String input, String sep) /*-{
return input.split(sep);
public static native String join(String[] strings, String sep) /*-{
return strings.join(sep);
public static int splitLength(String input, String regex) {
return StringUtil.split(input, regex).length;
public static boolean match(String input, String regex) {
return RegExp.compile(regex, "i").test(input);
public static String findAndReplace(String input, String regex, String replace) {
return RegExp.compile(regex, "gi").replace(input, replace);
* For some languages, counting the number of words relies on non-trivial word
* segmentation algorithms, or even huge look-up tables. This function needs to
* be reasonably fast, so the word count for some languages would only be an
* approximation.
* Read for more info.
public static interface WordCounter {
public int count(String s);
public static class FullWordCounter implements WordCounter {
public native int count(String s) /*-{
// The following range includes broader alphabetical letters and Hangul Syllables.
var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
var c = (m ? m.length : 0);
// The following range includes Hiragana, Katakana, and CJK Unified Ideographs.
// Hangul Syllables are not included.
m = s.match(/([\u3040-\uA4CF])/g);
c += Math.ceil((m ? m.length : 0) * 0.55);
return c;
public static class LetterWordCounter implements WordCounter {
public native int count(String s) /*-{
// The following range includes broader alphabetical letters and Hangul Syllables.
var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
return (m ? m.length : 0);
public static class FastWordCounter implements WordCounter {
public native int count(String s) /*-{
// The following range includes broader alphabetical letters.
var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g);
return (m ? m.length : 0);
public static void setWordCounter(String text) {
sWordCounter = selectWordCounter(text);
public static WordCounter selectWordCounter(String text) {
final RegExp rFull = RegExp.compile("[\\u3040-\\uA4CF]", "g");
final RegExp rLetter = RegExp.compile("[\\uAC00-\\uD7AF]", "g");
if (rFull.test(text)) {
return new FullWordCounter();
} else if (rLetter.test(text)) {
return new LetterWordCounter();
} else {
return new FastWordCounter();
// Use the safest version of WordCounter as the default.
// countWords() delegates to this counter; setWordCounter() replaces it.
static WordCounter sWordCounter = new FullWordCounter();
public static int countWords(String s) {
return sWordCounter.count(s);
public static native String regexEscape(String s) /*-{
return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
* Returns true if character is a digit.
public static native boolean isDigit(Character c) /*-{
return /\d/.test(c);
* Returns true if the entire string contains only digits.
public static native boolean isStringAllDigits(String s) /*-{
return /^\d+$/.test(s);
* Returns true if string contains at least 1 digit.
public static native boolean containsDigit(String s) /*-{
return /\d/.test(s);
* Returns the plain number if given string can be converted to one >= 0.
* Returns -1 if string is empty or not all digits.
public static int toNumber(String s) {
if (s.isEmpty() || !StringUtil.isStringAllDigits(s)) return -1;
return JavaScript.parseInt(s, 10);