blob: 7cbf18bd12c19f86b802a58a6b4b287db0ac7e7a [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* This class generates Unicode patterns (of the form \u0000-\u0005\u0009\u000c-\u0010) for a
* couple of character matching routines used in boilerpipe (e.g. Unicode character classes).
*
* It only supports Unicode's Basic Multilingual Plane (i.e. code points \u0000 to \uFFFF).
*/
class UnicodePatternGenerator {
    /** Exclusive upper bound of the scan: one past the last BMP code unit (0xFFFF). */
    private static final int BMP_LIMIT = 1 << 16;

    /** Matches a single character in the Unicode general categories L, Nd, Nl or No. */
    private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
            .compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]");

    /**
     * Generates and prints the range strings for word characters and for whitespace.
     *
     * For each of the two character sets, the generated range is wrapped in a regex character
     * class and fed back through the generator; "SUCCESS" plus the range is printed when the
     * round trip reproduces the same string, "FAILURE" otherwise.
     *
     * @param args unused.
     */
    public static void main(String[] args) {
        reportRoundTrip(createRange(PAT_VALID_WORD_CHARACTER));
        reportRoundTrip(createRange(new Checker() {
            @Override
            public boolean check(char c) {
                return Character.isWhitespace(c);
            }
        }));
    }

    /**
     * Re-derives the range from a character class built out of {@code range} and prints the
     * outcome of comparing both strings.
     */
    private static void reportRoundTrip(String range) {
        String verify = createRange(Pattern.compile("[" + range + "]"));
        if (range.equals(verify)) {
            System.out.println("SUCCESS");
            System.out.println(range);
        } else {
            System.out.println("FAILURE");
        }
    }

    /** Predicate deciding whether a single UTF-16 code unit belongs to the generated set. */
    private static interface Checker {
        boolean check(char s);
    }

    /** {@link Checker} that accepts a character when the pattern is found in its one-char string. */
    private static class PatternChecker implements Checker {
        private final Pattern p;

        PatternChecker(Pattern p) {
            this.p = p;
        }

        @Override
        public boolean check(char c) {
            return p.matcher(Character.toString(c)).find();
        }
    }

    /**
     * Formats a value as a backslash, the letter u, and the value in lowercase hexadecimal,
     * zero-padded to at least four digits.
     *
     * @param i the value to format.
     * @return the escape text for {@code i}.
     */
    public static String toCodePoint(Integer i) {
        return String.format("\\u%04x", i);
    }

    /**
     * Builds the range string for all BMP characters accepted by the given pattern.
     *
     * @param p pattern that is tested against each character individually.
     * @return the concatenated escapes and escape ranges, or an empty string if nothing matches.
     */
    public static String createRange(Pattern p) {
        return createRange(new PatternChecker(p));
    }

    /**
     * Builds the range string for all BMP characters accepted by the given checker.
     *
     * Every value from 0 to 0xFFFF is tested in ascending order. Each maximal run of consecutive
     * accepted values is written as a single escape (run of length one) or as
     * first-escape, '-', last-escape.
     *
     * @param c the membership test.
     * @return the concatenated runs, or an empty string if no character is accepted.
     */
    public static String createRange(Checker c) {
        StringBuilder range = new StringBuilder();
        int start = -1;  // First value of the run currently open, or -1 when no run is open.
        for (int i = 0; i < BMP_LIMIT; ++i) {
            if (c.check((char) i)) {
                if (start < 0) {
                    start = i;
                }
            } else if (start >= 0) {
                appendRun(range, start, i - 1);
                start = -1;
            }
        }
        // A run that reaches 0xFFFF has no following non-matching character to close it, so it
        // must be flushed here; otherwise it would be silently dropped.
        if (start >= 0) {
            appendRun(range, start, BMP_LIMIT - 1);
        }
        return range.toString();
    }

    /** Appends one run, collapsing it to a single escape when it covers just one value. */
    private static void appendRun(StringBuilder out, int first, int last) {
        out.append(toCodePoint(first));
        if (first != last) {
            out.append('-').append(toCodePoint(last));
        }
    }
}