diff options
author | Steve Hay <steve.m.hay@googlemail.com> | 2020-10-12 13:36:53 +0100 |
---|---|---|
committer | Steve Hay <steve.m.hay@googlemail.com> | 2020-10-12 17:15:02 +0100 |
commit | 27ee53f9faf9484162127a9577d5e1e11be081b9 (patch) | |
tree | 27e5748c9afc0b0b390674becf61c62d729fabf9 /cpan/Encode/lib | |
parent | ef977e200c13acac5a5bdac6641ebfcd6934c630 (diff) | |
download | perl-27ee53f9faf9484162127a9577d5e1e11be081b9.tar.gz |
Update Encode from version 3.06 to 3.07
Diffstat (limited to 'cpan/Encode/lib')
-rw-r--r-- | cpan/Encode/lib/Encode/GSM0338.pm | 160 |
1 files changed, 77 insertions, 83 deletions
diff --git a/cpan/Encode/lib/Encode/GSM0338.pm b/cpan/Encode/lib/Encode/GSM0338.pm index e87141ebc4..8b23a7bb6a 100644 --- a/cpan/Encode/lib/Encode/GSM0338.pm +++ b/cpan/Encode/lib/Encode/GSM0338.pm @@ -1,5 +1,5 @@ # -# $Id: GSM0338.pm,v 2.7 2017/06/10 17:23:50 dankogai Exp $ +# $Id: GSM0338.pm,v 2.8 2020/07/25 12:59:29 dankogai Exp dankogai $ # package Encode::GSM0338; @@ -8,7 +8,7 @@ use warnings; use Carp; use vars qw($VERSION); -$VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +$VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -19,8 +19,10 @@ sub needs_lines { 1 } sub perlio_ok { 0 } use utf8; + +# Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07) +# https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22) our %UNI2GSM = ( - "\x{0040}" => "\x00", # COMMERCIAL AT "\x{000A}" => "\x0A", # LINE FEED "\x{000C}" => "\x1B\x0A", # FORM FEED "\x{000D}" => "\x0D", # CARRIAGE RETURN @@ -56,6 +58,7 @@ our %UNI2GSM = ( "\x{003D}" => "\x3D", # EQUALS SIGN "\x{003E}" => "\x3E", # GREATER-THAN SIGN "\x{003F}" => "\x3F", # QUESTION MARK + "\x{0040}" => "\x00", # COMMERCIAL AT "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C @@ -82,6 +85,10 @@ our %UNI2GSM = ( "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z + "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET + "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS + "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET + "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT "\x{005F}" => "\x11", # LOW LINE "\x{0061}" => "\x61", # LATIN SMALL LETTER A "\x{0062}" => "\x62", # LATIN SMALL LETTER B @@ -109,16 +116,10 @@ our %UNI2GSM = ( "\x{0078}" => "\x78", # LATIN SMALL LETTER X "\x{0079}" => "\x79", # LATIN SMALL LETTER Y "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z - "\x{000C}" => "\x1B\x0A", # FORM FEED - "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET - "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS - "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET - "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET "\x{007C}" => "\x1B\x40", # VERTICAL LINE "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET "\x{007E}" => "\x1B\x3D", # TILDE - "\x{00A0}" => "\x1B", # NO-BREAK SPACE "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK "\x{00A3}" => "\x01", # POUND SIGN "\x{00A4}" => "\x24", # CURRENCY SIGN @@ -128,6 +129,7 @@ our %UNI2GSM = ( "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE + "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS @@ -138,8 +140,6 @@ our %UNI2GSM = ( "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE - #"\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA - "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE @@ -163,58 +163,38 @@ our %UNI2GSM = ( ); our %GSM2UNI = reverse %UNI2GSM; our $ESC = "\x1b"; -our $ATMARK = "\x40"; -our $FBCHAR = "\x3F"; -our $NBSP = "\x{00A0}"; - -#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" sub decode ($$;$) { my ( $obj, $bytes, $chk ) = @_; return undef unless defined $bytes; my $str = substr($bytes, 0, 0); # to propagate taintedness; while ( length $bytes ) { - my $c = substr( $bytes, 0, 1, '' ); - my $u; - if ( $c eq "\x00" ) { - my $c2 = substr( $bytes, 0, 1, '' ); - $u = - !length $c2 ? $ATMARK - : $c2 eq "\x00" ? "\x{0000}" - : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2} - : $chk - ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", - ord($c), ord($c2) ) - : $ATMARK . $FBCHAR; - - } - elsif ( $c eq $ESC ) { - my $c2 = substr( $bytes, 0, 1, '' ); - $u = - exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 } - : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2} - : $chk - ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", - ord($c), ord($c2) ) - : $NBSP . $FBCHAR; - } - else { - $u = - exists $GSM2UNI{$c} - ? $GSM2UNI{$c} - : $chk ? ref $chk eq 'CODE' - ? $chk->( ord $c ) - : croak sprintf( "\\x%02X does not map to Unicode", ord($c) ) - : $FBCHAR; + my $seq = ''; + my $c; + do { + $c = substr( $bytes, 0, 1, '' ); + $seq .= $c; + } while ( length $bytes and $c eq $ESC ); + my $u = + exists $GSM2UNI{$seq} + ? $GSM2UNI{$seq} + : ($chk && ref $chk eq 'CODE') + ? $chk->( unpack 'C*', $seq ) + : "\x{FFFD}"; + if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) { + croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) . ' does not map to Unicode' if $chk & Encode::DIE_ON_ERR; + carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) . ' does not map to Unicode' if $chk & Encode::WARN_ON_ERR; + if ($chk & Encode::RETURN_ON_ERR) { + $bytes .= $seq; + last; + } } $str .= $u; } - $_[1] = $bytes if $chk; + $_[1] = $bytes if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); return $str; } -#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" - sub encode($$;$) { my ( $obj, $str, $chk ) = @_; return undef unless defined $str; @@ -222,16 +202,23 @@ sub encode($$;$) { while ( length $str ) { my $u = substr( $str, 0, 1, '' ); my $c; - $bytes .= - exists $UNI2GSM{$u} - ? $UNI2GSM{$u} - : $chk ? ref $chk eq 'CODE' - ? $chk->( ord($u) ) - : croak sprintf( "\\x{%04x} does not map to %s", - ord($u), $obj->name ) - : $FBCHAR; + my $seq = + exists $UNI2GSM{$u} + ? $UNI2GSM{$u} + : ($chk && ref $chk eq 'CODE') + ? $chk->( ord($u) ) + : $UNI2GSM{'?'}; + if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) { + croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) if $chk & Encode::DIE_ON_ERR; + carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) if $chk & Encode::WARN_ON_ERR; + if ($chk & Encode::RETURN_ON_ERR) { + $str .= $u; + last; + } + } + $bytes .= $seq; } - $_[1] = $str if $chk; + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); return $bytes; } @@ -240,13 +227,13 @@ __END__ =head1 NAME -Encode::GSM0338 -- ESTI GSM 03.38 Encoding +Encode::GSM0338 -- ETSI GSM 03.38 Encoding =head1 SYNOPSIS - use Encode qw/encode decode/; - $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly - $utf8 = decode("gsm0338", $gsm0338); # ditto + use Encode qw/encode decode/; + $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly + $unicode = decode("gsm0338", $gsm0338); # ditto =head1 DESCRIPTION @@ -259,35 +246,42 @@ This was once handled by L<Encode::Bytes> but because of all those unusual specifications, Encode 2.20 has relocated the support to this module. -=head1 NOTES +This module implements only I<GSM 7 bit Default Alphabet> and +I<GSM 7 bit default alphabet extension table> according to standard +3GPP TS 23.038 version 16. Therefore I<National Language Single Shift> +and I<National Language Locking Shift> are not implemented nor supported. -Unlike most other encodings, the following always croaks on error -for any $chk that evaluates to true. +=head2 Septets - $gsm0338 = encode("gsm0338", $utf8 $chk); - $utf8 = decode("gsm0338", $gsm0338, $chk); +This modules operates with octets (like any other Encode module) and not +with packed septets (unlike other GSM standards). Therefore for processing +binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do +conversion between octets and packed septets. For this purpose perl's C<pack> +and C<unpack> functions may be useful: -So if you want to check the validity of the encoding, surround the -expression with C<eval {}> block as follows; + $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets); + $unicode = decode('GSM0338', $bytes); - eval { - $utf8 = decode("gsm0338", $gsm0338, $chk); - } or do { - # handle exception here - }; + $bytes = encode('GSM0338', $unicode); + $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes; + $num_of_septets = length $bytes; -=head1 BUGS +Please note that for correct decoding of packed septets it is required to +know number of septets packed in binary buffer as binary buffer is always +padded with zero bits and 7 zero bits represents character C<@>. Number +of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040. -ESTI GSM 03.38 Encoding itself. +=head1 BUGS -Mapping \x00 to '@' causes too much pain everywhere. +Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly +handled zero bytes (character C<@>). This was fixed in Encode::GSM0338 +version 2.8 (part of Encode 3.07). -Its use of \x1b (escape) is also very questionable. +=head1 SEE ALSO -Because of those two, the code paging approach used use in ucm-based -Encoding SOMETIMES fails so this module was written. +L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm> -=head1 SEE ALSO +L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf> L<Encode> |