sqlite/sqlite-src-3280000/ext/fts3/unicode/parseunicode.tcl - chromium/src/third_party - Git at Google


 #--------------------------------------------------------------------------
 # Parameter $zName must be a path to the file UnicodeData.txt. This command
 # reads the file and returns a list of mappings required to remove all
 # diacritical marks from a unicode string. Each mapping is itself a list
 # consisting of two elements - the unicode codepoint and the single ASCII
 # character that it should be replaced with, or an empty string if the
 # codepoint should simply be removed from the input. Examples:
 #
 #   { 224 a  0 }     (replace codepoint 224 to "a")
 #   { 769 "" 0 }     (remove codepoint 769 from input)
 #
 # Mappings are only returned for non-upper case codepoints. It is assumed
 # that the input has already been folded to lower case.
 #
 # The third value in the list is always either 0 or 1. 0 if the
 # UnicodeData.txt file maps the codepoint to a single ASCII character and
 # a diacritic, or 1 if the mapping is indirect. For example, consider the
 # two entries:
 #
 # 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
 # 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
 #
 # The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
 # diacritic). The second is an indirect mapping, as it maps to the
 # first codepoint plus 0302 (a diacritic).
 #
 proc rd_load_unicodedata_text {zName} {
   global tl_lookup_table

   set fd [open $zName]
   set lField {
     code
     character_name
     general_category
     canonical_combining_classes
     bidirectional_category
     character_decomposition_mapping
     decimal_digit_value
     digit_value
     numeric_value
     mirrored
     unicode_1_name
     iso10646_comment_field
     uppercase_mapping
     lowercase_mapping
     titlecase_mapping
   }
   set lRet [list]

   while { ![eof $fd] } {
     set line [gets $fd]
     if {$line == ""} continue

     set fields [split $line ";"]
     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
     foreach $lField $fields {}
     if { [llength $character_decomposition_mapping]!=2
       || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
     } {
       continue
     }

     set iCode  [expr "0x$code"]
     set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
     set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]

     # Filter out upper-case characters, as they will be mapped to their
     # lower-case equivalents before this data is used.
     if {[info exists tl_lookup_table($iCode)]} continue

     # Check if this is an indirect mapping. If so, set bIndirect to true
     # and change $iAscii to the indirectly mappped ASCII character.
     set bIndirect 0
     if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
       set iAscii $mapping($iAscii)
       set bIndirect 1
     }

     if { ($iAscii >= 97 && $iAscii <= 122)
       || ($iAscii >= 65 && $iAscii <= 90)
     } {
       lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
       set mapping($iCode) $iAscii
       set dia($iDia) 1
     }
   }

   foreach d [array names dia] {
     lappend lRet [list $d "" 0]
   }
   set lRet [lsort -integer -index 0 $lRet]

   close $fd
   set lRet
 }

 #-------------------------------------------------------------------------
 # Parameter $zName must be a path to the file UnicodeData.txt. This command
 # reads the file and returns a list of codepoints (integers). The list
 # contains all codepoints in the UnicodeData.txt assigned to any "General
 # Category" that is not a "Letter" or "Number".
 #
 proc an_load_unicodedata_text {zName} {
   set fd [open $zName]
   set lField {
     code
     character_name
     general_category
     canonical_combining_classes
     bidirectional_category
     character_decomposition_mapping
     decimal_digit_value
     digit_value
     numeric_value
     mirrored
     unicode_1_name
     iso10646_comment_field
     uppercase_mapping
     lowercase_mapping
     titlecase_mapping
   }
   set lRet [list]

   while { ![eof $fd] } {
     set line [gets $fd]
     if {$line == ""} continue

     set fields [split $line ";"]
     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
     foreach $lField $fields {}

     set iCode [expr "0x$code"]
     set bAlnum [expr {
          [lsearch {L N} [string range $general_category 0 0]] >= 0
       || $general_category=="Co"
     }]

     if { !$bAlnum } { lappend lRet $iCode }
   }

   close $fd
   set lRet
 }

 proc tl_load_casefolding_txt {zName} {
   global tl_lookup_table

   set fd [open $zName]
   while { ![eof $fd] } {
     set line [gets $fd]
     if {[string range $line 0 0] == "#"} continue
     if {$line == ""} continue

     foreach x {a b c d} {unset -nocomplain $x}
     foreach {a b c d} [split $line ";"] {}

     set a2 [list]
     set c2 [list]
     foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
     foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
     set b [string trim $b]
     set d [string trim $d]

     if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
   }
 }

 proc cc_load_unicodedata_text {zName} {
   set fd [open $zName]
   set lField {
     code
     character_name
     general_category
     canonical_combining_classes
     bidirectional_category
     character_decomposition_mapping
     decimal_digit_value
     digit_value
     numeric_value
     mirrored
     unicode_1_name
     iso10646_comment_field
     uppercase_mapping
     lowercase_mapping
     titlecase_mapping
   }
   set lRet [list]

   while { ![eof $fd] } {
     set line [gets $fd]
     if {$line == ""} continue

     set fields [split $line ";"]
     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
     foreach $lField $fields {}

     lappend lRet [list $code $general_category]
   }

   close $fd
   set lRet
 }

	#--------------------------------------------------------------------------
	# Parameter $zName must be a path to the file UnicodeData.txt. This command
	# reads the file and returns a list of mappings required to remove all
	# diacritical marks from a unicode string. Each mapping is itself a list
	# consisting of two elements - the unicode codepoint and the single ASCII
	# character that it should be replaced with, or an empty string if the
	# codepoint should simply be removed from the input. Examples:
	#
	# { 224 a 0 } (replace codepoint 224 to "a")
	# { 769 "" 0 } (remove codepoint 769 from input)
	#
	# Mappings are only returned for non-upper case codepoints. It is assumed
	# that the input has already been folded to lower case.
	#
	# The third value in the list is always either 0 or 1. 0 if the
	# UnicodeData.txt file maps the codepoint to a single ASCII character and
	# a diacritic, or 1 if the mapping is indirect. For example, consider the
	# two entries:
	#
	# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
	# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
	#
	# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
	# diacritic). The second is an indirect mapping, as it maps to the
	# first codepoint plus 0302 (a diacritic).
	#
	proc rd_load_unicodedata_text {zName} {
	global tl_lookup_table

	set fd [open $zName]
	set lField {
	code
	character_name
	general_category
	canonical_combining_classes
	bidirectional_category
	character_decomposition_mapping
	decimal_digit_value
	digit_value
	numeric_value
	mirrored
	unicode_1_name
	iso10646_comment_field
	uppercase_mapping
	lowercase_mapping
	titlecase_mapping
	}
	set lRet [list]

	while { ![eof $fd] } {
	set line [gets $fd]
	if {$line == ""} continue

	set fields [split $line ";"]
	if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
	foreach $lField $fields {}
	if { [llength $character_decomposition_mapping]!=2
	\|\| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
	} {
	continue
	}

	set iCode [expr "0x$code"]
	set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
	set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]

	# Filter out upper-case characters, as they will be mapped to their
	# lower-case equivalents before this data is used.
	if {[info exists tl_lookup_table($iCode)]} continue

	# Check if this is an indirect mapping. If so, set bIndirect to true
	# and change $iAscii to the indirectly mappped ASCII character.
	set bIndirect 0
	if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
	set iAscii $mapping($iAscii)
	set bIndirect 1
	}

	if { ($iAscii >= 97 && $iAscii <= 122)
	\|\| ($iAscii >= 65 && $iAscii <= 90)
	} {
	lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
	set mapping($iCode) $iAscii
	set dia($iDia) 1
	}
	}

	foreach d [array names dia] {
	lappend lRet [list $d "" 0]
	}
	set lRet [lsort -integer -index 0 $lRet]

	close $fd
	set lRet
	}

	#-------------------------------------------------------------------------
	# Parameter $zName must be a path to the file UnicodeData.txt. This command
	# reads the file and returns a list of codepoints (integers). The list
	# contains all codepoints in the UnicodeData.txt assigned to any "General
	# Category" that is not a "Letter" or "Number".
	#
	proc an_load_unicodedata_text {zName} {
	set fd [open $zName]
	set lField {
	code
	character_name
	general_category
	canonical_combining_classes
	bidirectional_category
	character_decomposition_mapping
	decimal_digit_value
	digit_value
	numeric_value
	mirrored
	unicode_1_name
	iso10646_comment_field
	uppercase_mapping
	lowercase_mapping
	titlecase_mapping
	}
	set lRet [list]

	while { ![eof $fd] } {
	set line [gets $fd]
	if {$line == ""} continue

	set fields [split $line ";"]
	if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
	foreach $lField $fields {}

	set iCode [expr "0x$code"]
	set bAlnum [expr {
	[lsearch {L N} [string range $general_category 0 0]] >= 0
	\|\| $general_category=="Co"
	}]

	if { !$bAlnum } { lappend lRet $iCode }
	}

	close $fd
	set lRet
	}

	proc tl_load_casefolding_txt {zName} {
	global tl_lookup_table

	set fd [open $zName]
	while { ![eof $fd] } {
	set line [gets $fd]
	if {[string range $line 0 0] == "#"} continue
	if {$line == ""} continue

	foreach x {a b c d} {unset -nocomplain $x}
	foreach {a b c d} [split $line ";"] {}

	set a2 [list]
	set c2 [list]
	foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
	foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
	set b [string trim $b]
	set d [string trim $d]

	if {$b=="C" \|\| $b=="S"} { set tl_lookup_table($a2) $c2 }
	}
	}

	proc cc_load_unicodedata_text {zName} {
	set fd [open $zName]
	set lField {
	code
	character_name
	general_category
	canonical_combining_classes
	bidirectional_category
	character_decomposition_mapping
	decimal_digit_value
	digit_value
	numeric_value
	mirrored
	unicode_1_name
	iso10646_comment_field
	uppercase_mapping
	lowercase_mapping
	titlecase_mapping
	}
	set lRet [list]

	while { ![eof $fd] } {
	set line [gets $fd]
	if {$line == ""} continue

	set fields [split $line ";"]
	if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
	foreach $lField $fields {}

	lappend lRet [list $code $general_category]
	}

	close $fd
	set lRet
	}