summaryrefslogtreecommitdiff
path: root/rtl/inc
diff options
context:
space:
mode:
authormarco <marco@3ad0048d-3df7-0310-abae-a5850022a9f2>2015-04-12 14:20:56 +0000
committermarco <marco@3ad0048d-3df7-0310-abae-a5850022a9f2>2015-04-12 14:20:56 +0000
commit499c6d7036c66e245c3be12629d931586e2be8a0 (patch)
treead492c2c1f80e8d6a19e20f9d643e8e19d78f29c /rtl/inc
parent859a502c7c5b3966e02818d90d6b0f47b6722745 (diff)
downloadfpc-499c6d7036c66e245c3be12629d931586e2be8a0.tar.gz
--- Merging r30047 into '.':
U rtl/java/jsystemh.inc U rtl/inc/generic.inc U rtl/inc/systemh.inc A tests/test/tutf8cpl.pp # revisions: 30047 git-svn-id: http://svn.freepascal.org/svn/fpc/branches/fixes_3_0@30552 3ad0048d-3df7-0310-abae-a5850022a9f2
Diffstat (limited to 'rtl/inc')
-rw-r--r--rtl/inc/generic.inc138
-rw-r--r--rtl/inc/systemh.inc8
2 files changed, 146 insertions, 0 deletions
diff --git a/rtl/inc/generic.inc b/rtl/inc/generic.inc
index 4774d639e4..330082f038 100644
--- a/rtl/inc/generic.inc
+++ b/rtl/inc/generic.inc
@@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
{$endif not cpujvm}
+
+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
+ var
+ bytes: sizeint;
+ firstzerobit: byte;
+ begin
+ { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
+
+ if maxlookahead<=0 then
+ begin
+ { incomplete }
+ result:=0;
+ exit;
+ end;
+ { inlcude the first byte }
+ result:=1;
+ { multiple byte utf-8 code point? }
+ if p[0]>#127 then
+ begin
+ { bsr searches for the leftmost 1 bit. We are interested in the
+ leftmost 0 bit, so first invert the value
+ }
+ firstzerobit:=bsrbyte(not(byte(p[0])));
+ { if there is no zero bit or the first zero bit is the rightmost bit
+ (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
+ UTF-8-encoded string, and in the worst case bit 1 has to be zero)
+ Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
+ cannot be the first zero-bit either. And bits 6 and 7 can't be 0
+ either in the first byte.
+ }
+ if (firstzerobit<=1) or (firstzerobit>=6) then
+ begin
+ result:=-result;
+ exit;
+ end;
+ { the number of bytes belonging to this code point is
+ 7-(pos first 0-bit). Subtract 1 since we're already at the first
+ byte. All subsequent bytes of the same sequence must have their
+ highest bit set and the next one unset. We stop when we detect an
+ invalid sequence.
+ }
+ bytes:=6-firstzerobit;
+ while (result<maxlookahead) and
+ (bytes>0) and
+ ((ord(p[result]) and %11000000)=%10000000) do
+ begin
+ inc(result);
+ dec(bytes);
+ end;
+ { stopped because of invalid/incomplete sequence -> exit }
+ if bytes<>0 then
+ begin
+ if result>=maxlookahead then
+ result:=0
+ else
+ result:=-result;
+ exit;
+ end;
+ end;
+ if includecombiningdiacriticalmarks then
+ begin
+ { combining diacritical marks?
+ 1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
+ 2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
+ 3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
+ 4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
+ 5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
+ }
+ repeat
+ bytes:=result;
+ if result+1<maxlookahead then
+ begin
+ { case 1) }
+ if ((ord(p[result]) and %11001100=%11001100)) and
+ (ord(p[result+1])>=%10000000) and
+ (ord(p[result+1])<=%10101111) then
+ inc(result,2)
+ { case 2), 3), 4), 5) }
+ else if (result+2<maxlookahead) and
+ (ord(p[result])>=%11100001) then
+ begin
+ { case 2) }
+ if ((ord(p[result])=%11100001) and
+ (ord(p[result+1])=%10101010) and
+ (ord(p[result+2])>=%10110000) and
+ (ord(p[result+2])<=%10111111)) or
+ { case 3) }
+ ((ord(p[result])=%11100001) and
+ (ord(p[result+1])=%10110111) and
+ (ord(p[result+2])>=%10000000) and
+ (ord(p[result+2])<=%10111111)) or
+ { case 4) }
+ ((ord(p[result])=%11100010) and
+ (ord(p[result+1])=%10000011) and
+ (ord(p[result+2])>=%10010000) and
+ (ord(p[result+2])<=%10111111)) or
+ { case 5) }
+ ((ord(p[result])=%11101111) and
+ (ord(p[result+1])=%10111000) and
+ (ord(p[result+2])>=%10100000) and
+ (ord(p[result+2])<=%10101111)) then
+ inc(result,3);
+ end;
+ end;
+ until bytes=result;
+ { is there an incomplete diacritical mark? (invalid makes little sense:
+ either a sequence is a combining diacritical mark, or it's not ; if
+ it's invalid, it may also not have been a combining diacritical mark)
+ }
+ if result<maxlookahead then
+ begin
+ { case 1) }
+ if (((ord(p[result]) and %11001100=%11001100)) and
+ (result+1>=maxlookahead)) or
+ { case 2) and 3)}
+ ((ord(p[result])=%11100001) and
+ ((result+1>=maxlookahead) or
+ (((ord(p[result+1])=%10101010) or
+ (ord(p[result+1])=%10110111)) and
+ (result+2>=maxlookahead)))) or
+ { case 4 }
+ ((ord(p[result])=%11100010) and
+ ((result+1>=maxlookahead) or
+ ((ord(p[result+1])=%10000011) and
+ (result+2>=maxlookahead)))) or
+ { case 5 }
+ ((ord(p[result])=%11101111) and
+ ((result+1>=maxlookahead) or
+ ((ord(p[result+1])=%10111000) and
+ (result+2>=maxlookahead)))) then
+ begin
+ result:=0;
+ exit;
+ end;
+ end;
+ end;
+ end;
+
{$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}
procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc;
diff --git a/rtl/inc/systemh.inc b/rtl/inc/systemh.inc
index 69b489ad9f..72b3a858f6 100644
--- a/rtl/inc/systemh.inc
+++ b/rtl/inc/systemh.inc
@@ -1065,6 +1065,14 @@ Function Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif}
function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
+{ result:
+ <0: invalid sequence detected after processing "-result" bytes
+ 0: incomplete (may still be valid if MaxLookAhead is increased)
+ >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
+ parameter was true)
+}
+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
+
{ Shortstring functions }
Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt);
Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt);