diff options
Diffstat (limited to 'src/backend/utils/mb/Unicode/convutils.pm')
-rw-r--r-- | src/backend/utils/mb/Unicode/convutils.pm | 240 |
1 files changed, 134 insertions, 106 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm index 42b4ffaaef..43cadf5303 100644 --- a/src/backend/utils/mb/Unicode/convutils.pm +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -9,15 +9,15 @@ use strict; use Exporter 'import'; -our @EXPORT = qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); +our @EXPORT = + qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); # Constants used in the 'direction' field of the character maps use constant { NONE => 0, TO_UNICODE => 1, FROM_UNICODE => 2, - BOTH => 3 -}; + BOTH => 3 }; ####################################################################### # read_source - common routine to read source file @@ -36,7 +36,7 @@ sub read_source next if (/^#/); chop; - next if (/^$/); # Ignore empty lines + next if (/^$/); # Ignore empty lines next if (/^0x([0-9A-F]+)\s+(#.*)$/); @@ -49,13 +49,13 @@ sub read_source print STDERR "READ ERROR at line $. in $fname: $_\n"; exit; } - my $out = {code => hex($1), - ucs => hex($2), - comment => $4, - direction => BOTH, - f => $fname, - l => $. - }; + my $out = { + code => hex($1), + ucs => hex($2), + comment => $4, + direction => BOTH, + f => $fname, + l => $. }; # Ignore pure ASCII mappings. PostgreSQL character conversion code # never even passes these to the conversion code. @@ -92,8 +92,10 @@ sub print_conversion_tables { my ($this_script, $csname, $charset) = @_; - print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, $charset); - print_conversion_tables_direction($this_script, $csname, TO_UNICODE, $charset); + print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, + $charset); + print_conversion_tables_direction($this_script, $csname, TO_UNICODE, + $charset); } ############################################################################# @@ -117,14 +119,14 @@ sub print_conversion_tables_direction my $tblname; if ($direction == TO_UNICODE) { - $fname = lc("${csname}_to_utf8.map"); + $fname = lc("${csname}_to_utf8.map"); $tblname = lc("${csname}_to_unicode_tree"); print "- Writing ${csname}=>UTF8 conversion table: $fname\n"; } else { - $fname = lc("utf8_to_${csname}.map"); + $fname = lc("utf8_to_${csname}.map"); $tblname = lc("${csname}_from_unicode_tree"); print "- Writing UTF8=>${csname} conversion table: $fname\n"; @@ -135,24 +137,22 @@ sub print_conversion_tables_direction print $out "/* src/backend/utils/mb/Unicode/$fname */\n"; print $out "/* This file is generated by $this_script */\n\n"; - # Collect regular, non-combined, mappings, and create the radix tree from them. +# Collect regular, non-combined, mappings, and create the radix tree from them. my $charmap = &make_charmap($out, $charset, $direction, 0); print_radix_table($out, $tblname, $charmap); - # Collect combined characters, and create combined character table (if any) + # Collect combined characters, and create combined character table (if any) my $charmap_combined = &make_charmap_combined($charset, $direction); if (scalar @{$charmap_combined} > 0) { if ($direction == TO_UNICODE) { - print_to_utf8_combined_map($out, $csname, - $charmap_combined, 1); + print_to_utf8_combined_map($out, $csname, $charmap_combined, 1); } else { - print_from_utf8_combined_map($out, $csname, - $charmap_combined, 1); + print_from_utf8_combined_map($out, $csname, $charmap_combined, 1); } } @@ -166,14 +166,16 @@ sub print_from_utf8_combined_map my $last_comment = ""; printf $out "\n/* Combined character map */\n"; - printf $out "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + printf $out +"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", scalar(@$table); my $first = 1; - foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) - { + foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table) + { print($out ",") if (!$first); $first = 0; - print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); + print $out "\t/* $last_comment */" + if ($verbose && $last_comment ne ""); printf $out "\n {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code}; @@ -198,15 +200,17 @@ sub print_to_utf8_combined_map my $last_comment = ""; printf $out "\n/* Combined character map */\n"; - printf $out "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + printf $out +"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", scalar(@$table); my $first = 1; - foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) - { + foreach my $i (sort { $a->{code} <=> $b->{code} } @$table) + { print($out ",") if (!$first); $first = 0; - print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); + print $out "\t/* $last_comment */" + if ($verbose && $last_comment ne ""); printf $out "\n {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second}; @@ -214,7 +218,7 @@ sub print_to_utf8_combined_map if ($verbose >= 2) { $last_comment = - sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } elsif ($verbose >= 1) { @@ -255,25 +259,25 @@ sub print_radix_table } elsif ($in < 0x10000) { - my $b1 = $in >> 8; - my $b2 = $in & 0xff; + my $b1 = $in >> 8; + my $b2 = $in & 0xff; $b2map{$b1}{$b2} = $out; } elsif ($in < 0x1000000) { - my $b1 = $in >> 16; - my $b2 = ($in >> 8) & 0xff; - my $b3 = $in & 0xff; + my $b1 = $in >> 16; + my $b2 = ($in >> 8) & 0xff; + my $b3 = $in & 0xff; $b3map{$b1}{$b2}{$b3} = $out; } elsif ($in < 0x100000000) { - my $b1 = $in >> 24; - my $b2 = ($in >> 16) & 0xff; - my $b3 = ($in >> 8) & 0xff; - my $b4 = $in & 0xff; + my $b1 = $in >> 24; + my $b2 = ($in >> 16) & 0xff; + my $b3 = ($in >> 8) & 0xff; + my $b4 = $in & 0xff; $b4map{$b1}{$b2}{$b3}{$b4} = $out; } @@ -309,10 +313,14 @@ sub print_radix_table ### # Add the segments for the radix trees themselves. - push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); - push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); - push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); - push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); + push @segments, + build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); + push @segments, + build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); + push @segments, + build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); + push @segments, + build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); ### ### Find min and max index used in each level of each tree. @@ -325,23 +333,24 @@ sub print_radix_table my %max_idx; foreach my $seg (@segments) { - my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}}; - my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}}; + my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} }; + my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} }; - foreach my $i (keys %{$seg->{values}}) + foreach my $i (keys %{ $seg->{values} }) { $this_min = $i if (!defined $this_min || $i < $this_min); $this_max = $i if (!defined $this_max || $i > $this_max); } - $min_idx{$seg->{depth}}{$seg->{level}} = $this_min; - $max_idx{$seg->{depth}}{$seg->{level}} = $this_max; + $min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min; + $max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max; } + # Copy the mins and max's back to every segment, for convenience. foreach my $seg (@segments) { - $seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}}; - $seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}}; + $seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} }; + $seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} }; } ### @@ -359,11 +368,10 @@ sub print_radix_table $widest_range = $this_range if ($this_range > $widest_range); } - unshift @segments, { - header => "Dummy map, for invalid values", + unshift @segments, + { header => "Dummy map, for invalid values", min_idx => 0, - max_idx => $widest_range - }; + max_idx => $widest_range }; ### ### Eliminate overlapping zeros @@ -378,26 +386,34 @@ sub print_radix_table ### for (my $j = 0; $j < $#segments - 1; $j++) { - my $seg = $segments[$j]; - my $nextseg = $segments[$j + 1]; + my $seg = $segments[$j]; + my $nextseg = $segments[ $j + 1 ]; # Count the number of zero values at the end of this segment. my $this_trail_zeros = 0; - for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--) + for ( + my $i = $seg->{max_idx}; + $i >= $seg->{min_idx} && !$seg->{values}->{$i}; + $i--) { $this_trail_zeros++; } # Count the number of zeros at the beginning of next segment. my $next_lead_zeros = 0; - for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++) + for ( + my $i = $nextseg->{min_idx}; + $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; + $i++) { $next_lead_zeros++; } # How many zeros in common? my $overlaid_trail_zeros = - ($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros; + ($this_trail_zeros > $next_lead_zeros) + ? $next_lead_zeros + : $this_trail_zeros; $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; @@ -419,7 +435,7 @@ sub print_radix_table foreach my $seg (@segments) { $seg->{offset} = $flatoff; - $segmap{$seg->{label}} = $flatoff; + $segmap{ $seg->{label} } = $flatoff; $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; } my $tblsize = $flatoff; @@ -427,9 +443,9 @@ sub print_radix_table # Second pass: look up the offset of each label reference in the hash. foreach my $seg (@segments) { - while (my ($i, $val) = each %{$seg->{values}}) + while (my ($i, $val) = each %{ $seg->{values} }) { - if (!($val =~ /^[0-9,.E]+$/ )) + if (!($val =~ /^[0-9,.E]+$/)) { my $segoff = $segmap{$val}; if ($segoff) @@ -482,7 +498,7 @@ sub print_radix_table my $max_val = 0; foreach my $seg (@segments) { - foreach my $val (values %{$seg->{values}}) + foreach my $val (values %{ $seg->{values} }) { $max_val = $val if ($val > $max_val); } @@ -498,17 +514,17 @@ sub print_radix_table if ($max_val <= 0xffff) { $vals_per_line = 8; - $colwidth = 4; + $colwidth = 4; } elsif ($max_val <= 0xffffff) { $vals_per_line = 4; - $colwidth = 6; + $colwidth = 6; } else { $vals_per_line = 4; - $colwidth = 8; + $colwidth = 8; } ### @@ -529,17 +545,20 @@ sub print_radix_table print $out " ${tblname}_table,\n"; } printf $out "\n"; - printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", $b1root; + printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", + $b1root; printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; printf $out "\n"; - printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", $b2root; + printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", + $b2root; printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; printf $out "\n"; - printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b3root; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", + $b3root; printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; @@ -547,7 +566,8 @@ sub print_radix_table printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; printf $out "\n"; - printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b4root; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", + $b4root; printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; @@ -561,18 +581,21 @@ sub print_radix_table print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; print $out "{"; my $off = 0; + foreach my $seg (@segments) { printf $out "\n"; printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; printf $out "\n"; - for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};) + for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};) { + # Print the next line's worth of values. # XXX pad to begin at a nice boundary printf $out " /* %02x */ ", $i; - for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) + for (my $j = 0; + $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) { my $val = $seg->{values}->{$i}; @@ -588,7 +611,8 @@ sub print_radix_table } if ($seg->{overlaid_trail_zeros}) { - printf $out " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; + printf $out +" /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; } } @@ -607,13 +631,14 @@ sub build_segments_from_tree if (%{$map}) { - @segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); + @segments = + build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); # Sort the segments into "breadth-first" order. Not strictly required, # but makes the maps nicer to read. - @segments = sort { $a->{level} cmp $b->{level} or - $a->{path} cmp $b->{path}} - @segments; + @segments = + sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} } + @segments; } return @segments; @@ -628,14 +653,13 @@ sub build_segments_recurse if ($level == $depth) { - push @segments, { - header => $header . ", leaf: ${path}xx", - label => $label, - level => $level, - depth => $depth, - path => $path, - values => $map - }; + push @segments, + { header => $header . ", leaf: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => $map }; } else { @@ -646,19 +670,19 @@ sub build_segments_recurse my $childpath = $path . sprintf("%02x", $i); my $childlabel = "$depth-level-$level-$childpath"; - push @segments, build_segments_recurse($header, $childlabel, $childpath, - $level + 1, $depth, $val); + push @segments, + build_segments_recurse($header, $childlabel, $childpath, + $level + 1, $depth, $val); $children{$i} = $childlabel; } - push @segments, { - header => $header . ", byte #$level: ${path}xx", - label => $label, - level => $level, - depth => $depth, - path => $path, - values => \%children - }; + push @segments, + { header => $header . ", byte #$level: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => \%children }; } return @segments; } @@ -688,29 +712,31 @@ sub make_charmap my %charmap; foreach my $c (@$charset) { + # combined characters are handled elsewhere next if (defined $c->{ucs_second}); next if ($c->{direction} != $direction && $c->{direction} != BOTH); my ($src, $dst) = - $direction == TO_UNICODE - ? ($c->{code}, ucs2utf($c->{ucs})) - : (ucs2utf($c->{ucs}), $c->{code}); + $direction == TO_UNICODE + ? ($c->{code}, ucs2utf($c->{ucs})) + : (ucs2utf($c->{ucs}), $c->{code}); # check for duplicate source codes if (defined $charmap{$src}) { printf STDERR - "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n", - $c->{f}, $c->{l}, $src, $charmap{$src}, $dst; +"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n", + $c->{f}, $c->{l}, $src, $charmap{$src}, $dst; exit; } $charmap{$src} = $dst; if ($verbose) { - printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, $c->{l}, $c->{comment}; + printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, + $c->{l}, $c->{comment}; } } if ($verbose) @@ -743,11 +769,13 @@ sub make_charmap_combined if (defined $c->{ucs_second}) { - my $entry = {utf8 => ucs2utf($c->{ucs}), - utf8_second => ucs2utf($c->{ucs_second}), - code => $c->{code}, - comment => $c->{comment}, - f => $c->{f}, l => $c->{l}}; + my $entry = { + utf8 => ucs2utf($c->{ucs}), + utf8_second => ucs2utf($c->{ucs_second}), + code => $c->{code}, + comment => $c->{comment}, + f => $c->{f}, + l => $c->{l} }; push @combined, $entry; } } |