blob: 0cb2c83a18f76deaf8b75ebe9c5000544b7708e1 [file] [log] [blame] [edit]
#--------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
# { 224 a } (replace codepoint 224 to "a")
# { 769 "" } (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
proc rd_load_unicodedata_text {zName} {
global tl_lookup_table
set fd [open $zName]
set lField {
code
character_name
general_category
canonical_combining_classes
bidirectional_category
character_decomposition_mapping
decimal_digit_value
digit_value
numeric_value
mirrored
unicode_1_name
iso10646_comment_field
uppercase_mapping
lowercase_mapping
titlecase_mapping
}
set lRet [list]
while { ![eof $fd] } {
set line [gets $fd]
if {$line == ""} continue
set fields [split $line ";"]
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
foreach $lField $fields {}
if { [llength $character_decomposition_mapping]!=2
|| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
} {
continue
}
set iCode [expr "0x$code"]
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
if {[info exists tl_lookup_table($iCode)]} continue
if { ($iAscii >= 97 && $iAscii <= 122)
|| ($iAscii >= 65 && $iAscii <= 90)
} {
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
set dia($iDia) 1
}
}
foreach d [array names dia] {
lappend lRet [list $d ""]
}
set lRet [lsort -integer -index 0 $lRet]
close $fd
set lRet
}
#-------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
set fd [open $zName]
set lField {
code
character_name
general_category
canonical_combining_classes
bidirectional_category
character_decomposition_mapping
decimal_digit_value
digit_value
numeric_value
mirrored
unicode_1_name
iso10646_comment_field
uppercase_mapping
lowercase_mapping
titlecase_mapping
}
set lRet [list]
while { ![eof $fd] } {
set line [gets $fd]
if {$line == ""} continue
set fields [split $line ";"]
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
foreach $lField $fields {}
set iCode [expr "0x$code"]
set bAlnum [expr {
[lsearch {L N} [string range $general_category 0 0]] >= 0
|| $general_category=="Co"
}]
if { !$bAlnum } { lappend lRet $iCode }
}
close $fd
set lRet
}
proc tl_load_casefolding_txt {zName} {
global tl_lookup_table
set fd [open $zName]
while { ![eof $fd] } {
set line [gets $fd]
if {[string range $line 0 0] == "#"} continue
if {$line == ""} continue
foreach x {a b c d} {unset -nocomplain $x}
foreach {a b c d} [split $line ";"] {}
set a2 [list]
set c2 [list]
foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
set b [string trim $b]
set d [string trim $d]
if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
}
}