summaryrefslogtreecommitdiff
path: root/cpan/Encode
diff options
context:
space:
mode:
authorSteve Hay <steve.m.hay@googlemail.com>2020-10-12 13:36:53 +0100
committerSteve Hay <steve.m.hay@googlemail.com>2020-10-12 17:15:02 +0100
commit27ee53f9faf9484162127a9577d5e1e11be081b9 (patch)
tree27e5748c9afc0b0b390674becf61c62d729fabf9 /cpan/Encode
parentef977e200c13acac5a5bdac6641ebfcd6934c630 (diff)
downloadperl-27ee53f9faf9484162127a9577d5e1e11be081b9.tar.gz
Update Encode from version 3.06 to 3.07
Diffstat (limited to 'cpan/Encode')
-rw-r--r--cpan/Encode/Encode.pm4
-rw-r--r--cpan/Encode/lib/Encode/GSM0338.pm160
-rw-r--r--cpan/Encode/t/gsm0338.t30
3 files changed, 90 insertions, 104 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm
index de06ba149e..77ca93e87b 100644
--- a/cpan/Encode/Encode.pm
+++ b/cpan/Encode/Encode.pm
@@ -1,5 +1,5 @@
#
-# $Id: Encode.pm,v 3.06 2020/05/02 02:31:14 dankogai Exp $
+# $Id: Encode.pm,v 3.07 2020/07/25 12:59:10 dankogai Exp $
#
package Encode;
use strict;
@@ -7,7 +7,7 @@ use warnings;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
- $VERSION = sprintf "%d.%02d", q$Revision: 3.06 $ =~ /(\d+)/g;
+ $VERSION = sprintf "%d.%02d", q$Revision: 3.07 $ =~ /(\d+)/g;
require XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );
}
diff --git a/cpan/Encode/lib/Encode/GSM0338.pm b/cpan/Encode/lib/Encode/GSM0338.pm
index e87141ebc4..8b23a7bb6a 100644
--- a/cpan/Encode/lib/Encode/GSM0338.pm
+++ b/cpan/Encode/lib/Encode/GSM0338.pm
@@ -1,5 +1,5 @@
#
-# $Id: GSM0338.pm,v 2.7 2017/06/10 17:23:50 dankogai Exp $
+# $Id: GSM0338.pm,v 2.8 2020/07/25 12:59:29 dankogai Exp dankogai $
#
package Encode::GSM0338;
@@ -8,7 +8,7 @@ use warnings;
use Carp;
use vars qw($VERSION);
-$VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+$VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use Encode qw(:fallbacks);
@@ -19,8 +19,10 @@ sub needs_lines { 1 }
sub perlio_ok { 0 }
use utf8;
+
+# Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07)
+# https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22)
our %UNI2GSM = (
- "\x{0040}" => "\x00", # COMMERCIAL AT
"\x{000A}" => "\x0A", # LINE FEED
"\x{000C}" => "\x1B\x0A", # FORM FEED
"\x{000D}" => "\x0D", # CARRIAGE RETURN
@@ -56,6 +58,7 @@ our %UNI2GSM = (
"\x{003D}" => "\x3D", # EQUALS SIGN
"\x{003E}" => "\x3E", # GREATER-THAN SIGN
"\x{003F}" => "\x3F", # QUESTION MARK
+ "\x{0040}" => "\x00", # COMMERCIAL AT
"\x{0041}" => "\x41", # LATIN CAPITAL LETTER A
"\x{0042}" => "\x42", # LATIN CAPITAL LETTER B
"\x{0043}" => "\x43", # LATIN CAPITAL LETTER C
@@ -82,6 +85,10 @@ our %UNI2GSM = (
"\x{0058}" => "\x58", # LATIN CAPITAL LETTER X
"\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y
"\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z
+ "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET
+ "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS
+ "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET
+ "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT
"\x{005F}" => "\x11", # LOW LINE
"\x{0061}" => "\x61", # LATIN SMALL LETTER A
"\x{0062}" => "\x62", # LATIN SMALL LETTER B
@@ -109,16 +116,10 @@ our %UNI2GSM = (
"\x{0078}" => "\x78", # LATIN SMALL LETTER X
"\x{0079}" => "\x79", # LATIN SMALL LETTER Y
"\x{007A}" => "\x7A", # LATIN SMALL LETTER Z
- "\x{000C}" => "\x1B\x0A", # FORM FEED
- "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET
- "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS
- "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET
- "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT
"\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET
"\x{007C}" => "\x1B\x40", # VERTICAL LINE
"\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET
"\x{007E}" => "\x1B\x3D", # TILDE
- "\x{00A0}" => "\x1B", # NO-BREAK SPACE
"\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK
"\x{00A3}" => "\x01", # POUND SIGN
"\x{00A4}" => "\x24", # CURRENCY SIGN
@@ -128,6 +129,7 @@ our %UNI2GSM = (
"\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS
"\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE
"\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE
+ "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA
"\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE
"\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE
"\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS
@@ -138,8 +140,6 @@ our %UNI2GSM = (
"\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS
"\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE
"\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE
- #"\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA
- "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA
"\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE
"\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE
"\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE
@@ -163,58 +163,38 @@ our %UNI2GSM = (
);
our %GSM2UNI = reverse %UNI2GSM;
our $ESC = "\x1b";
-our $ATMARK = "\x40";
-our $FBCHAR = "\x3F";
-our $NBSP = "\x{00A0}";
-
-#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
sub decode ($$;$) {
my ( $obj, $bytes, $chk ) = @_;
return undef unless defined $bytes;
my $str = substr($bytes, 0, 0); # to propagate taintedness;
while ( length $bytes ) {
- my $c = substr( $bytes, 0, 1, '' );
- my $u;
- if ( $c eq "\x00" ) {
- my $c2 = substr( $bytes, 0, 1, '' );
- $u =
- !length $c2 ? $ATMARK
- : $c2 eq "\x00" ? "\x{0000}"
- : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2}
- : $chk
- ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode",
- ord($c), ord($c2) )
- : $ATMARK . $FBCHAR;
-
- }
- elsif ( $c eq $ESC ) {
- my $c2 = substr( $bytes, 0, 1, '' );
- $u =
- exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 }
- : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2}
- : $chk
- ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode",
- ord($c), ord($c2) )
- : $NBSP . $FBCHAR;
- }
- else {
- $u =
- exists $GSM2UNI{$c}
- ? $GSM2UNI{$c}
- : $chk ? ref $chk eq 'CODE'
- ? $chk->( ord $c )
- : croak sprintf( "\\x%02X does not map to Unicode", ord($c) )
- : $FBCHAR;
+ my $seq = '';
+ my $c;
+ do {
+ $c = substr( $bytes, 0, 1, '' );
+ $seq .= $c;
+ } while ( length $bytes and $c eq $ESC );
+ my $u =
+ exists $GSM2UNI{$seq}
+ ? $GSM2UNI{$seq}
+ : ($chk && ref $chk eq 'CODE')
+ ? $chk->( unpack 'C*', $seq )
+ : "\x{FFFD}";
+ if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) {
+ croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) . ' does not map to Unicode' if $chk & Encode::DIE_ON_ERR;
+ carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) . ' does not map to Unicode' if $chk & Encode::WARN_ON_ERR;
+ if ($chk & Encode::RETURN_ON_ERR) {
+ $bytes .= $seq;
+ last;
+ }
}
$str .= $u;
}
- $_[1] = $bytes if $chk;
+ $_[1] = $bytes if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
return $str;
}
-#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
-
sub encode($$;$) {
my ( $obj, $str, $chk ) = @_;
return undef unless defined $str;
@@ -222,16 +202,23 @@ sub encode($$;$) {
while ( length $str ) {
my $u = substr( $str, 0, 1, '' );
my $c;
- $bytes .=
- exists $UNI2GSM{$u}
- ? $UNI2GSM{$u}
- : $chk ? ref $chk eq 'CODE'
- ? $chk->( ord($u) )
- : croak sprintf( "\\x{%04x} does not map to %s",
- ord($u), $obj->name )
- : $FBCHAR;
+ my $seq =
+ exists $UNI2GSM{$u}
+ ? $UNI2GSM{$u}
+ : ($chk && ref $chk eq 'CODE')
+ ? $chk->( ord($u) )
+ : $UNI2GSM{'?'};
+ if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) {
+ croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) if $chk & Encode::DIE_ON_ERR;
+ carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) if $chk & Encode::WARN_ON_ERR;
+ if ($chk & Encode::RETURN_ON_ERR) {
+ $str .= $u;
+ last;
+ }
+ }
+ $bytes .= $seq;
}
- $_[1] = $str if $chk;
+ $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
return $bytes;
}
@@ -240,13 +227,13 @@ __END__
=head1 NAME
-Encode::GSM0338 -- ESTI GSM 03.38 Encoding
+Encode::GSM0338 -- ETSI GSM 03.38 Encoding
=head1 SYNOPSIS
- use Encode qw/encode decode/;
- $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly
- $utf8 = decode("gsm0338", $gsm0338); # ditto
+ use Encode qw/encode decode/;
+ $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly
+ $unicode = decode("gsm0338", $gsm0338); # ditto
=head1 DESCRIPTION
@@ -259,35 +246,42 @@ This was once handled by L<Encode::Bytes> but because of all those
unusual specifications, Encode 2.20 has relocated the support to
this module.
-=head1 NOTES
+This module implements only I<GSM 7 bit Default Alphabet> and
+I<GSM 7 bit default alphabet extension table> according to standard
+3GPP TS 23.038 version 16. Therefore I<National Language Single Shift>
+and I<National Language Locking Shift> are not implemented nor supported.
-Unlike most other encodings, the following always croaks on error
-for any $chk that evaluates to true.
+=head2 Septets
- $gsm0338 = encode("gsm0338", $utf8 $chk);
- $utf8 = decode("gsm0338", $gsm0338, $chk);
+This modules operates with octets (like any other Encode module) and not
+with packed septets (unlike other GSM standards). Therefore for processing
+binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do
+conversion between octets and packed septets. For this purpose perl's C<pack>
+and C<unpack> functions may be useful:
-So if you want to check the validity of the encoding, surround the
-expression with C<eval {}> block as follows;
+ $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets);
+ $unicode = decode('GSM0338', $bytes);
- eval {
- $utf8 = decode("gsm0338", $gsm0338, $chk);
- } or do {
- # handle exception here
- };
+ $bytes = encode('GSM0338', $unicode);
+ $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes;
+ $num_of_septets = length $bytes;
-=head1 BUGS
+Please note that for correct decoding of packed septets it is required to
+know number of septets packed in binary buffer as binary buffer is always
+padded with zero bits and 7 zero bits represents character C<@>. Number
+of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040.
-ESTI GSM 03.38 Encoding itself.
+=head1 BUGS
-Mapping \x00 to '@' causes too much pain everywhere.
+Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly
+handled zero bytes (character C<@>). This was fixed in Encode::GSM0338
+version 2.8 (part of Encode 3.07).
-Its use of \x1b (escape) is also very questionable.
+=head1 SEE ALSO
-Because of those two, the code paging approach used use in ucm-based
-Encoding SOMETIMES fails so this module was written.
+L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm>
-=head1 SEE ALSO
+L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf>
L<Encode>
diff --git a/cpan/Encode/t/gsm0338.t b/cpan/Encode/t/gsm0338.t
index 127604b461..21a82fad5e 100644
--- a/cpan/Encode/t/gsm0338.t
+++ b/cpan/Encode/t/gsm0338.t
@@ -13,21 +13,15 @@ BEGIN {
use strict;
use utf8;
-use Test::More tests => 780;
+use Test::More tests => 776;
use Encode;
use Encode::GSM0338;
-# The specification of GSM 03.38 is not awfully clear.
-# (http://www.unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT)
-# The various combinations of 0x00 and 0x1B as leading bytes
-# are unclear, as is the semantics of those bytes as standalone
-# or as final single bytes.
-
-
my $chk = Encode::LEAVE_SRC();
# escapes
-# see http://www.csoft.co.uk/sms/character_sets/gsm.htm
+# see https://www.3gpp.org/dynareport/23038.htm
+# see https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/15.00.00_60/ts_123038v150000p.pdf (page 22)
my %esc_seq = (
"\x{20ac}" => "\x1b\x65",
"\x0c" => "\x1b\x0A",
@@ -51,26 +45,20 @@ sub eu{
}
for my $c ( map { chr } 0 .. 127 ) {
+ next if $c eq "\x1B"; # escape character, start of multibyte sequence
my $u = $Encode::GSM0338::GSM2UNI{$c};
# default character set
is decode( "gsm0338", $c, $chk ), $u,
sprintf( "decode \\x%02X", ord($c) );
- eval { decode( "gsm0338", $c . "\xff", $chk ) };
+ eval { decode( "gsm0338", $c . "\xff", $chk | Encode::FB_CROAK ) };
ok( $@, $@ );
is encode( "gsm0338", $u, $chk ), $c, sprintf( "encode %s", eu($u) );
- eval { encode( "gsm0338", $u . "\x{3000}", $chk ) };
+ eval { encode( "gsm0338", $u . "\x{3000}", $chk | Encode::FB_CROAK ) };
ok( $@, $@ );
- # nasty atmark
- if ( $c eq "\x00" ) {
- is decode( "gsm0338", "\x00" . $c, $chk ), "\x00",
- sprintf( '@@ =>: \x00+\x%02X', ord($c) );
- }
- else {
is decode( "gsm0338", "\x00" . $c ), '@' . decode( "gsm0338", $c ),
sprintf( '@: decode \x00+\x%02X', ord($c) );
- }
# escape seq.
my $ecs = "\x1b" . $c;
@@ -82,7 +70,7 @@ for my $c ( map { chr } 0 .. 127 ) {
}
else {
is decode( "gsm0338", $ecs, $chk ),
- "\xA0" . decode( "gsm0338", $c ),
+ "\x{FFFD}",
sprintf( "decode ESC+\\x%02X", ord($c) );
}
}
@@ -91,6 +79,10 @@ for my $c ( map { chr } 0 .. 127 ) {
is decode("gsm0338", "\x09") => chr(0xC7), 'RT75670: decode';
is encode("gsm0338", chr(0xC7)) => "\x09", 'RT75670: encode';
+# https://rt.cpan.org/Public/Bug/Display.html?id=124571
+is decode("gsm0338", encode('gsm0338', '..@@..')), '..@@..';
+is decode("gsm0338", encode('gsm0338', '..@€..')), '..@€..';
+
__END__
for my $c (map { chr } 0..127){
my $b = "\x1b$c";