summary | refs | log | tree | commit | diff
path: root/src/backend/utils/mb/Unicode/convutils.pm
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/mb/Unicode/convutils.pm')
-rw-r--r-- src/backend/utils/mb/Unicode/convutils.pm 240
1 files changed, 134 insertions, 106 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
index 42b4ffaaef..43cadf5303 100644
--- a/src/backend/utils/mb/Unicode/convutils.pm
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -9,15 +9,15 @@ use strict;
use Exporter 'import';
-our @EXPORT = qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
+our @EXPORT =
+ qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
# Constants used in the 'direction' field of the character maps
use constant {
NONE => 0,
TO_UNICODE => 1,
FROM_UNICODE => 2,
- BOTH => 3
-};
+ BOTH => 3 };
#######################################################################
# read_source - common routine to read source file
@@ -36,7 +36,7 @@ sub read_source
next if (/^#/);
chop;
- next if (/^$/); # Ignore empty lines
+ next if (/^$/); # Ignore empty lines
next if (/^0x([0-9A-F]+)\s+(#.*)$/);
@@ -49,13 +49,13 @@ sub read_source
print STDERR "READ ERROR at line $. in $fname: $_\n";
exit;
}
- my $out = {code => hex($1),
- ucs => hex($2),
- comment => $4,
- direction => BOTH,
- f => $fname,
- l => $.
- };
+ my $out = {
+ code => hex($1),
+ ucs => hex($2),
+ comment => $4,
+ direction => BOTH,
+ f => $fname,
+ l => $. };
# Ignore pure ASCII mappings. PostgreSQL character conversion code
# never even passes these to the conversion code.
@@ -92,8 +92,10 @@ sub print_conversion_tables
{
my ($this_script, $csname, $charset) = @_;
- print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, $charset);
- print_conversion_tables_direction($this_script, $csname, TO_UNICODE, $charset);
+ print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
+ $charset);
+ print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
+ $charset);
}
#############################################################################
@@ -117,14 +119,14 @@ sub print_conversion_tables_direction
my $tblname;
if ($direction == TO_UNICODE)
{
- $fname = lc("${csname}_to_utf8.map");
+ $fname = lc("${csname}_to_utf8.map");
$tblname = lc("${csname}_to_unicode_tree");
print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
}
else
{
- $fname = lc("utf8_to_${csname}.map");
+ $fname = lc("utf8_to_${csname}.map");
$tblname = lc("${csname}_from_unicode_tree");
print "- Writing UTF8=>${csname} conversion table: $fname\n";
@@ -135,24 +137,22 @@ sub print_conversion_tables_direction
print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
print $out "/* This file is generated by $this_script */\n\n";
- # Collect regular, non-combined, mappings, and create the radix tree from them.
+# Collect regular, non-combined, mappings, and create the radix tree from them.
my $charmap = &make_charmap($out, $charset, $direction, 0);
print_radix_table($out, $tblname, $charmap);
- # Collect combined characters, and create combined character table (if any)
+ # Collect combined characters, and create combined character table (if any)
my $charmap_combined = &make_charmap_combined($charset, $direction);
if (scalar @{$charmap_combined} > 0)
{
if ($direction == TO_UNICODE)
{
- print_to_utf8_combined_map($out, $csname,
- $charmap_combined, 1);
+ print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
}
else
{
- print_from_utf8_combined_map($out, $csname,
- $charmap_combined, 1);
+ print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
}
}
@@ -166,14 +166,16 @@ sub print_from_utf8_combined_map
my $last_comment = "";
printf $out "\n/* Combined character map */\n";
- printf $out "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
+ printf $out
+"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
scalar(@$table);
my $first = 1;
- foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
- {
+ foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
+ {
print($out ",") if (!$first);
$first = 0;
- print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
+ print $out "\t/* $last_comment */"
+ if ($verbose && $last_comment ne "");
printf $out "\n {0x%08x, 0x%08x, 0x%04x}",
$i->{utf8}, $i->{utf8_second}, $i->{code};
@@ -198,15 +200,17 @@ sub print_to_utf8_combined_map
my $last_comment = "";
printf $out "\n/* Combined character map */\n";
- printf $out "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
+ printf $out
+"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
scalar(@$table);
my $first = 1;
- foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
- {
+ foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
+ {
print($out ",") if (!$first);
$first = 0;
- print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
+ print $out "\t/* $last_comment */"
+ if ($verbose && $last_comment ne "");
printf $out "\n {0x%04x, 0x%08x, 0x%08x}",
$i->{code}, $i->{utf8}, $i->{utf8_second};
@@ -214,7 +218,7 @@ sub print_to_utf8_combined_map
if ($verbose >= 2)
{
$last_comment =
- sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
+ sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
}
elsif ($verbose >= 1)
{
@@ -255,25 +259,25 @@ sub print_radix_table
}
elsif ($in < 0x10000)
{
- my $b1 = $in >> 8;
- my $b2 = $in & 0xff;
+ my $b1 = $in >> 8;
+ my $b2 = $in & 0xff;
$b2map{$b1}{$b2} = $out;
}
elsif ($in < 0x1000000)
{
- my $b1 = $in >> 16;
- my $b2 = ($in >> 8) & 0xff;
- my $b3 = $in & 0xff;
+ my $b1 = $in >> 16;
+ my $b2 = ($in >> 8) & 0xff;
+ my $b3 = $in & 0xff;
$b3map{$b1}{$b2}{$b3} = $out;
}
elsif ($in < 0x100000000)
{
- my $b1 = $in >> 24;
- my $b2 = ($in >> 16) & 0xff;
- my $b3 = ($in >> 8) & 0xff;
- my $b4 = $in & 0xff;
+ my $b1 = $in >> 24;
+ my $b2 = ($in >> 16) & 0xff;
+ my $b3 = ($in >> 8) & 0xff;
+ my $b4 = $in & 0xff;
$b4map{$b1}{$b2}{$b3}{$b4} = $out;
}
@@ -309,10 +313,14 @@ sub print_radix_table
###
# Add the segments for the radix trees themselves.
- push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
- push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
- push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
- push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
+ push @segments,
+ build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
+ push @segments,
+ build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
+ push @segments,
+ build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
+ push @segments,
+ build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
###
### Find min and max index used in each level of each tree.
@@ -325,23 +333,24 @@ sub print_radix_table
my %max_idx;
foreach my $seg (@segments)
{
- my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}};
- my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}};
+ my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
+ my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };
- foreach my $i (keys %{$seg->{values}})
+ foreach my $i (keys %{ $seg->{values} })
{
$this_min = $i if (!defined $this_min || $i < $this_min);
$this_max = $i if (!defined $this_max || $i > $this_max);
}
- $min_idx{$seg->{depth}}{$seg->{level}} = $this_min;
- $max_idx{$seg->{depth}}{$seg->{level}} = $this_max;
+ $min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
+ $max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
}
+
# Copy the mins and max's back to every segment, for convenience.
foreach my $seg (@segments)
{
- $seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}};
- $seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}};
+ $seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
+ $seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
}
###
@@ -359,11 +368,10 @@ sub print_radix_table
$widest_range = $this_range if ($this_range > $widest_range);
}
- unshift @segments, {
- header => "Dummy map, for invalid values",
+ unshift @segments,
+ { header => "Dummy map, for invalid values",
min_idx => 0,
- max_idx => $widest_range
- };
+ max_idx => $widest_range };
###
### Eliminate overlapping zeros
@@ -378,26 +386,34 @@ sub print_radix_table
###
for (my $j = 0; $j < $#segments - 1; $j++)
{
- my $seg = $segments[$j];
- my $nextseg = $segments[$j + 1];
+ my $seg = $segments[$j];
+ my $nextseg = $segments[ $j + 1 ];
# Count the number of zero values at the end of this segment.
my $this_trail_zeros = 0;
- for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--)
+ for (
+ my $i = $seg->{max_idx};
+ $i >= $seg->{min_idx} && !$seg->{values}->{$i};
+ $i--)
{
$this_trail_zeros++;
}
# Count the number of zeros at the beginning of next segment.
my $next_lead_zeros = 0;
- for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++)
+ for (
+ my $i = $nextseg->{min_idx};
+ $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
+ $i++)
{
$next_lead_zeros++;
}
# How many zeros in common?
my $overlaid_trail_zeros =
- ($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros;
+ ($this_trail_zeros > $next_lead_zeros)
+ ? $next_lead_zeros
+ : $this_trail_zeros;
$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
@@ -419,7 +435,7 @@ sub print_radix_table
foreach my $seg (@segments)
{
$seg->{offset} = $flatoff;
- $segmap{$seg->{label}} = $flatoff;
+ $segmap{ $seg->{label} } = $flatoff;
$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
}
my $tblsize = $flatoff;
@@ -427,9 +443,9 @@ sub print_radix_table
# Second pass: look up the offset of each label reference in the hash.
foreach my $seg (@segments)
{
- while (my ($i, $val) = each %{$seg->{values}})
+ while (my ($i, $val) = each %{ $seg->{values} })
{
- if (!($val =~ /^[0-9,.E]+$/ ))
+ if (!($val =~ /^[0-9,.E]+$/))
{
my $segoff = $segmap{$val};
if ($segoff)
@@ -482,7 +498,7 @@ sub print_radix_table
my $max_val = 0;
foreach my $seg (@segments)
{
- foreach my $val (values %{$seg->{values}})
+ foreach my $val (values %{ $seg->{values} })
{
$max_val = $val if ($val > $max_val);
}
@@ -498,17 +514,17 @@ sub print_radix_table
if ($max_val <= 0xffff)
{
$vals_per_line = 8;
- $colwidth = 4;
+ $colwidth = 4;
}
elsif ($max_val <= 0xffffff)
{
$vals_per_line = 4;
- $colwidth = 6;
+ $colwidth = 6;
}
else
{
$vals_per_line = 4;
- $colwidth = 8;
+ $colwidth = 8;
}
###
@@ -529,17 +545,20 @@ sub print_radix_table
print $out " ${tblname}_table,\n";
}
printf $out "\n";
- printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", $b1root;
+ printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n",
+ $b1root;
printf $out " 0x%02x, /* b1_lower */\n", $b1_lower;
printf $out " 0x%02x, /* b1_upper */\n", $b1_upper;
printf $out "\n";
- printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", $b2root;
+ printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n",
+ $b2root;
printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower;
printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper;
printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower;
printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper;
printf $out "\n";
- printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b3root;
+ printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n",
+ $b3root;
printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower;
printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper;
printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower;
@@ -547,7 +566,8 @@ sub print_radix_table
printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower;
printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper;
printf $out "\n";
- printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b4root;
+ printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n",
+ $b4root;
printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower;
printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper;
printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower;
@@ -561,18 +581,21 @@ sub print_radix_table
print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
print $out "{";
my $off = 0;
+
foreach my $seg (@segments)
{
printf $out "\n";
printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
printf $out "\n";
- for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};)
+ for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
{
+
# Print the next line's worth of values.
# XXX pad to begin at a nice boundary
printf $out " /* %02x */ ", $i;
- for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
+ for (my $j = 0;
+ $j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
{
my $val = $seg->{values}->{$i};
@@ -588,7 +611,8 @@ sub print_radix_table
}
if ($seg->{overlaid_trail_zeros})
{
- printf $out " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
+ printf $out
+" /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
}
}
@@ -607,13 +631,14 @@ sub build_segments_from_tree
if (%{$map})
{
- @segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
+ @segments =
+ build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
# Sort the segments into "breadth-first" order. Not strictly required,
# but makes the maps nicer to read.
- @segments = sort { $a->{level} cmp $b->{level} or
- $a->{path} cmp $b->{path}}
- @segments;
+ @segments =
+ sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
+ @segments;
}
return @segments;
@@ -628,14 +653,13 @@ sub build_segments_recurse
if ($level == $depth)
{
- push @segments, {
- header => $header . ", leaf: ${path}xx",
- label => $label,
- level => $level,
- depth => $depth,
- path => $path,
- values => $map
- };
+ push @segments,
+ { header => $header . ", leaf: ${path}xx",
+ label => $label,
+ level => $level,
+ depth => $depth,
+ path => $path,
+ values => $map };
}
else
{
@@ -646,19 +670,19 @@ sub build_segments_recurse
my $childpath = $path . sprintf("%02x", $i);
my $childlabel = "$depth-level-$level-$childpath";
- push @segments, build_segments_recurse($header, $childlabel, $childpath,
- $level + 1, $depth, $val);
+ push @segments,
+ build_segments_recurse($header, $childlabel, $childpath,
+ $level + 1, $depth, $val);
$children{$i} = $childlabel;
}
- push @segments, {
- header => $header . ", byte #$level: ${path}xx",
- label => $label,
- level => $level,
- depth => $depth,
- path => $path,
- values => \%children
- };
+ push @segments,
+ { header => $header . ", byte #$level: ${path}xx",
+ label => $label,
+ level => $level,
+ depth => $depth,
+ path => $path,
+ values => \%children };
}
return @segments;
}
@@ -688,29 +712,31 @@ sub make_charmap
my %charmap;
foreach my $c (@$charset)
{
+
# combined characters are handled elsewhere
next if (defined $c->{ucs_second});
next if ($c->{direction} != $direction && $c->{direction} != BOTH);
my ($src, $dst) =
- $direction == TO_UNICODE
- ? ($c->{code}, ucs2utf($c->{ucs}))
- : (ucs2utf($c->{ucs}), $c->{code});
+ $direction == TO_UNICODE
+ ? ($c->{code}, ucs2utf($c->{ucs}))
+ : (ucs2utf($c->{ucs}), $c->{code});
# check for duplicate source codes
if (defined $charmap{$src})
{
printf STDERR
- "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
- $c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
+"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
+ $c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
exit;
}
$charmap{$src} = $dst;
if ($verbose)
{
- printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, $c->{l}, $c->{comment};
+ printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
+ $c->{l}, $c->{comment};
}
}
if ($verbose)
@@ -743,11 +769,13 @@ sub make_charmap_combined
if (defined $c->{ucs_second})
{
- my $entry = {utf8 => ucs2utf($c->{ucs}),
- utf8_second => ucs2utf($c->{ucs_second}),
- code => $c->{code},
- comment => $c->{comment},
- f => $c->{f}, l => $c->{l}};
+ my $entry = {
+ utf8 => ucs2utf($c->{ucs}),
+ utf8_second => ucs2utf($c->{ucs_second}),
+ code => $c->{code},
+ comment => $c->{comment},
+ f => $c->{f},
+ l => $c->{l} };
push @combined, $entry;
}
}