path: root/utils/unicode
diff options
authornickysn <nickysn@3ad0048d-3df7-0310-abae-a5850022a9f2>2021-02-19 19:32:10 +0000
committernickysn <nickysn@3ad0048d-3df7-0310-abae-a5850022a9f2>2021-02-19 19:32:10 +0000
commite5ae4a829c1d2f4c18d99156ff2c32ec52e3fd53 (patch)
tree9bac9978bebdd0e67a845e2120cf851bb3f5cd50 /utils/unicode
parent0330faed56e71ea899352863e53ba6a4dec24226 (diff)
+ added tool for parsing GraphemeBreakProperty.txt and converting it to code
git-svn-id: 3ad0048d-3df7-0310-abae-a5850022a9f2
Diffstat (limited to 'utils/unicode')
2 files changed, 437 insertions, 0 deletions
diff --git a/utils/unicode/gbpparser.lpi b/utils/unicode/gbpparser.lpi
new file mode 100644
index 0000000000..940fae4e66
--- /dev/null
+++ b/utils/unicode/gbpparser.lpi
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+ <ProjectOptions>
+ <Version Value="11"/>
+ <General>
+ <Flags>
+ <MainUnitHasCreateFormStatements Value="False"/>
+ <MainUnitHasTitleStatement Value="False"/>
+ <MainUnitHasScaledStatement Value="False"/>
+ </Flags>
+ <SessionStorage Value="InProjectDir"/>
+ <MainUnit Value="0"/>
+ <Title Value="gbpparser"/>
+ <UseAppBundle Value="False"/>
+ <ResourceType Value="res"/>
+ </General>
+ <BuildModes Count="1">
+ <Item1 Name="Default" Default="True"/>
+ </BuildModes>
+ <PublishOptions>
+ <Version Value="2"/>
+ <UseFileFilters Value="True"/>
+ </PublishOptions>
+ <RunParams>
+ <FormatVersion Value="2"/>
+ <Modes Count="0"/>
+ </RunParams>
+ <Units Count="1">
+ <Unit0>
+ <Filename Value="gbpparser.lpr"/>
+ <IsPartOfProject Value="True"/>
+ </Unit0>
+ </Units>
+ </ProjectOptions>
+ <CompilerOptions>
+ <Version Value="11"/>
+ <Target>
+ <Filename Value="gbpparser"/>
+ </Target>
+ <SearchPaths>
+ <IncludeFiles Value="$(ProjOutDir)"/>
+ <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
+ </SearchPaths>
+ </CompilerOptions>
+ <Debugging>
+ <Exceptions Count="3">
+ <Item1>
+ <Name Value="EAbort"/>
+ </Item1>
+ <Item2>
+ <Name Value="ECodetoolError"/>
+ </Item2>
+ <Item3>
+ <Name Value="EFOpenError"/>
+ </Item3>
+ </Exceptions>
+ </Debugging>
diff --git a/utils/unicode/gbpparser.lpr b/utils/unicode/gbpparser.lpr
new file mode 100644
index 0000000000..1d47fa4479
--- /dev/null
+++ b/utils/unicode/gbpparser.lpr
@@ -0,0 +1,379 @@
+{ Parser and code generator for the GraphemeBreakProperty.
+ Copyright (C) 2021 Nikolay Nikolov
+ This source is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your option)
+ any later version.
+ This code is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ details.
+ A copy of the GNU General Public License is available on the World Wide Web
+ at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
+ to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+ Boston, MA 02110-1335, USA.
+}
+program gbpparser;
+{$mode objfpc}{$H+}
+ SysUtils, StrUtils;
+ { Grapheme break property values handled by this tool. Apart from gbpOther,
+   each value corresponds to one property name accepted by
+   ParseGraphemeBreakProperty. gbpOther is declared first, so it has ordinal 0
+   and is what a zero-filled table entry holds. }
+ TGraphemeBreakProperty = (
+ gbpOther,
+ gbpPrepend,
+ gbpCR,
+ gbpLF,
+ gbpControl,
+ gbpExtend,
+ { NOTE(review): the 'gpb' prefix (instead of 'gbp') looks like a transposition.
+   ParseGraphemeBreakProperty uses the same spelling and GenCode writes enum
+   values into the generated text, so a rename would also change that output -
+   confirm against the consumer of the generated code before touching it. }
+ gpbRegional_Indicator,
+ gbpSpacingMark,
+ gbpL,
+ gbpV,
+ gbpT,
+ gbpLV,
+ gbpLVT,
+ gbpE_Base,
+ gbpE_Modifier,
+ gbpZWJ,
+ gbpGlue_After_Zwj,
+ gbpE_Base_GAZ);
+ { A run of code points; CalcStatsAndRanges fills it so that both bounds are
+   inclusive. }
+ TRange = record
+ RangeLo, RangeHi: UCS4Char;
+ end;
+ { Dynamic list of ranges, one list per property (see GBPStats). }
+ TRanges = array of TRange;
+ { Property of every code point, indexed by the code point itself. Zeroed by
+   the main program and filled by ParseGraphemeBreakProperties. }
+ GraphemeBreakProperties: array [UCS4Char] of TGraphemeBreakProperty;
+ { Per-property summary, built by CalcStatsAndRanges and then consumed and
+   modified by the code generator. }
+ GBPStats: array [TGraphemeBreakProperty] of record
+ Exists: Boolean; { at least one code point has this property }
+ Handled: Boolean; { set by GenCode once the property needs no more output }
+ MinValue: UCS4Char; { lowest code point with this property }
+ MaxValue: UCS4Char; { highest code point with this property }
+ Count: LongInt; { number of code points with this property }
+ Ranges: TRanges; { runs of consecutive code points, in ascending order }
+ end;
+{ Maps a property name (the second field of a data line) to its enum value.
+  Surrounding whitespace is ignored; the name itself must match one of the
+  case labels below exactly.
+  Raises EArgumentException for any other name. }
+function ParseGraphemeBreakProperty(S: string): TGraphemeBreakProperty;
+begin
+  S := Trim(S);
+  case S of
+    'Prepend':
+      Result := gbpPrepend;
+    'CR':
+      Result := gbpCR;
+    'LF':
+      Result := gbpLF;
+    'Control':
+      Result := gbpControl;
+    'Extend':
+      Result := gbpExtend;
+    'Regional_Indicator':
+      Result := gpbRegional_Indicator;
+    'SpacingMark':
+      Result := gbpSpacingMark;
+    'L':
+      Result := gbpL;
+    'V':
+      Result := gbpV;
+    'T':
+      Result := gbpT;
+    'LV':
+      Result := gbpLV;
+    'LVT':
+      Result := gbpLVT;
+    'E_Base':
+      Result := gbpE_Base;
+    'E_Modifier':
+      Result := gbpE_Modifier;
+    'ZWJ':
+      Result := gbpZWJ;
+    'Glue_After_Zwj':
+      Result := gbpGlue_After_Zwj;
+    'E_Base_GAZ':
+      Result := gbpE_Base_GAZ;
+  else
+    { .Create is required: EArgumentException(...) on its own is a typecast of
+      the string to a class reference, not a constructor call, so no valid
+      exception object would be raised. }
+    raise EArgumentException.Create('Unknown grapheme break property: ''' + S + '''');
+  end;
+end;
+{ Parses the code point field of a data line: either one hexadecimal code
+  point ('000D') or an inclusive range written as 'lo..hi' ('0600..0605').
+  Surrounding whitespace is ignored. For a single code point RangeHi equals
+  RangeLo.
+  StrToInt raises EConvertError when a part is not a valid hexadecimal number. }
+procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char);
+var
+  dp: SizeInt;
+begin
+  S := Trim(S);
+  dp := Pos('..', S);
+  if dp > 0 then
+  begin
+    RangeLo := StrToInt('$' + LeftStr(S, dp - 1));
+    { The upper bound starts right after the two dots (index dp + 2) and runs
+      to the end of S, which is Length(S) - dp - 1 characters. }
+    RangeHi := StrToInt('$' + Copy(S, dp + 2, Length(S) - dp - 1));
+  end
+  else
+  begin
+    RangeLo := StrToInt('$' + S);
+    RangeHi := RangeLo;
+  end;
+end;
+{ Reads the data file FileName and records the property of every listed code
+  point in the global GraphemeBreakProperties table. Each non-empty line has
+  the form '<code point or range> ; <property>'; everything from '#' to the
+  end of a line is a comment. Code points that are not listed keep whatever
+  value the table already holds.
+  Prints a message and halts with exit code 1 when the file does not exist.
+  Raises Exception when a line does not consist of exactly two ';'-separated
+  fields; errors raised by ParseRange and ParseGraphemeBreakProperty propagate
+  unchanged. The file is closed in every case. }
+procedure ParseGraphemeBreakProperties(const FileName: string);
+var
+  InF: TextFile;
+  S: string;
+  SplitS: TStringArray;
+  LineNr: Integer = 0;
+  gbp: TGraphemeBreakProperty;
+  RangeLo, RangeHi, R: UCS4Char;
+  CommentPos: SizeInt;
+begin
+  if not FileExists(FileName) then
+  begin
+    Writeln('File doesn''t exist: ', FileName);
+    Halt(1);
+  end;
+  AssignFile(InF, FileName);
+  Reset(InF);
+  { try..finally so that a malformed line does not leave the file open }
+  try
+    while not EoF(InF) do
+    begin
+      Inc(LineNr);
+      Readln(InF, S);
+      S := Trim(S);
+      { strip the trailing comment, if any }
+      CommentPos := Pos('#', S);
+      if CommentPos > 0 then
+        S := LeftStr(S, CommentPos - 1);
+      if S <> '' then
+      begin
+        SplitS := S.Split([';']);
+        if Length(SplitS) <> 2 then
+          raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr));
+        ParseRange(SplitS[0], RangeLo, RangeHi);
+        gbp := ParseGraphemeBreakProperty(SplitS[1]);
+        for R := RangeLo to RangeHi do
+          GraphemeBreakProperties[R] := gbp;
+      end;
+    end;
+  finally
+    CloseFile(InF);
+  end;
+end;
+{ Rebuilds GBPStats from the GraphemeBreakProperties table in one ascending
+  pass over all code points. For every property it records whether it occurs,
+  its lowest and highest code point, how many code points carry it and the
+  list of runs of consecutive code points that carry it. }
+procedure CalcStatsAndRanges;
+var
+  CodePoint: UCS4Char;
+  Cur, Prev: TGraphemeBreakProperty;
+  OpensRange: Boolean;
+  Last: Integer;
+begin
+  FillChar(GBPStats, SizeOf(GBPStats), 0);
+  Cur := Low(TGraphemeBreakProperty);
+  for CodePoint := Low(UCS4Char) to High(UCS4Char) do
+  begin
+    Prev := Cur;
+    Cur := GraphemeBreakProperties[CodePoint];
+    { a new run starts at the first occurrence of a property and whenever the
+      previous code point had a different property }
+    OpensRange := (not GBPStats[Cur].Exists) or (Prev <> Cur);
+    if not GBPStats[Cur].Exists then
+    begin
+      GBPStats[Cur].Exists := True;
+      GBPStats[Cur].MinValue := CodePoint;
+    end;
+    GBPStats[Cur].MaxValue := CodePoint;
+    { Count starts at 0 after the FillChar above }
+    Inc(GBPStats[Cur].Count);
+    if OpensRange then
+    begin
+      Last := Length(GBPStats[Cur].Ranges);
+      SetLength(GBPStats[Cur].Ranges, Last + 1);
+      GBPStats[Cur].Ranges[Last].RangeLo := CodePoint;
+    end
+    else
+      Last := High(GBPStats[Cur].Ranges);
+    GBPStats[Cur].Ranges[Last].RangeHi := CodePoint;
+  end;
+end;
+{ Called after the code points RLo..RHi have been dealt with. Looks through
+  the existing, not yet handled properties (gbpOther excluded) for two
+  neighbouring ranges that are separated by exactly RLo..RHi, merges the first
+  such pair into one range and returns. Does nothing when no pair matches. }
+procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char);
+var
+  Prop: TGraphemeBreakProperty;
+  Idx: Integer;
+begin
+  for Prop := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+  begin
+    if GBPStats[Prop].Handled or (not GBPStats[Prop].Exists) then
+      continue;
+    Idx := 0;
+    while Idx < High(GBPStats[Prop].Ranges) do
+    begin
+      if (GBPStats[Prop].Ranges[Idx].RangeHi = (RLo - 1)) and
+         (GBPStats[Prop].Ranges[Idx + 1].RangeLo = (RHi + 1)) then
+      begin
+        { absorb the right-hand neighbour, then drop it from the list }
+        GBPStats[Prop].Ranges[Idx].RangeHi := GBPStats[Prop].Ranges[Idx + 1].RangeHi;
+        Delete(GBPStats[Prop].Ranges, Idx + 1, 1);
+        exit;
+      end;
+      Inc(Idx);
+    end;
+  end;
+end;
+{ Returns the smallest number of ranges among the properties that exist and
+  are not handled yet (gbpOther excluded), or High(Integer) when there is no
+  such property. }
+function FindMinRangeCount: Integer;
+var
+  Prop: TGraphemeBreakProperty;
+  RangeCount: Integer;
+begin
+  Result := High(Integer);
+  for Prop := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+  begin
+    if (not GBPStats[Prop].Exists) or GBPStats[Prop].Handled then
+      continue;
+    RangeCount := Length(GBPStats[Prop].Ranges);
+    if RangeCount < Result then
+      Result := RangeCount;
+  end;
+end;
+{ Checks whether gbpLV and gbpLVT follow the regular pattern that allows them
+  to be emitted as one arithmetic test: within 44032..55203 every 28th code
+  point, counted from 44032, is gbpLV and every other one is gbpLVT, and
+  neither property occurs outside that interval.
+  Returns True when the pattern holds; has no side effects. }
+function ApplyLV_LVTCompression: Boolean;
+const
+  RangeLo = 44032;
+  RangeHi = 55203;
+var
+  Ch: UCS4Char;
+  Expected: TGraphemeBreakProperty;
+  BoundsMatch: Boolean;
+begin
+  Result := False;
+  { the last gbpLV code point is 27 below the end of the interval }
+  BoundsMatch := (GBPStats[gbpLV].MinValue = RangeLo) and
+                 (GBPStats[gbpLV].MaxValue = (RangeHi - 27)) and
+                 (GBPStats[gbpLVT].MinValue = (RangeLo + 1)) and
+                 (GBPStats[gbpLVT].MaxValue = RangeHi);
+  if not BoundsMatch then
+    exit;
+  for Ch := RangeLo to RangeHi do
+  begin
+    if ((Ch - RangeLo) mod 28) = 0 then
+      Expected := gbpLV
+    else
+      Expected := gbpLVT;
+    if GraphemeBreakProperties[Ch] <> Expected then
+      exit;
+  end;
+  Result := True;
+end;
+{ Writes Pascal statements to OutFileName that assign 'result' from 'Ch':
+  a chain of 'if ... then result:=<property> else' tests that ends in either
+  a 'case Ch of' statement or a plain 'result:=gbpOther'.
+  Properties are emitted in stages: single code points, single ranges, the
+  LV/LVT pattern, then properties with at most RangeCountThreshold ranges as
+  or-ed comparisons; whatever is left goes into the case statement. gbpOther
+  is never tested for, it is the fallback.
+  Side effects: sets GBPStats[...].Handled and, through MaybeCoalesceRanges,
+  rewrites the Ranges of properties that are still pending.
+  Raises Exception('Internal error') when a property with Count = 1 has
+  MinValue <> MaxValue.
+  NOTE(review): OutFile is not closed if that exception is raised. }
+procedure GenCode(const OutFileName: string);
+ { largest number of ranges that is still emitted as or-ed comparisons }
+ RangeCountThreshold = 30{400};
+ gbp: TGraphemeBreakProperty;
+ RI, NextRangeCount: Integer;
+ OutFile: TextFile;
+ Writeln('Generating file: ', OutFileName);
+ AssignFile(OutFile, OutFileName);
+ Rewrite(OutFile);
+ Writeln(OutFile, '{ do not edit, this file is autogenerated by the gbpparser tool }');
+ { unused properties are already handled }
+ for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+ if not GBPStats[gbp].Exists then
+ GBPStats[gbp].Handled := True;
+ { handle single codepoints first }
+ for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+ if (not GBPStats[gbp].Handled) and (GBPStats[gbp].Count = 1) then
+ begin
+ if GBPStats[gbp].MinValue <> GBPStats[gbp].MaxValue then
+ raise Exception.Create('Internal error');
+ Writeln(OutFile, 'if Ch=', GBPStats[gbp].MinValue, 'then result:=',gbp,' else');
+ GBPStats[gbp].Handled := True;
+ { the emitted code point may have been the only gap between two ranges of a pending property }
+ MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
+ end;
+ { handle single range codepoints next }
+ { the while re-checks after each pass: coalescing can leave another property with a single range }
+ while FindMinRangeCount = 1 do
+ for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+ if (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) = 1) then
+ begin
+ Writeln(OutFile, 'if(Ch>=', GBPStats[gbp].MinValue, ')and(Ch<=', GBPStats[gbp].MaxValue, ')then result:=',gbp,' else');
+ GBPStats[gbp].Handled := True;
+ MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
+ end;
+ { LV/LVT: one arithmetic test replaces both range lists when the pattern check succeeds }
+ if ApplyLV_LVTCompression then
+ begin
+ Writeln(OutFile, 'if(Ch>=44032)and(Ch<=55203)then begin if((Ch-44032)mod 28)=0then result:=gbpLV else result:=gbpLVT end else');
+ GBPStats[gbpLV].Handled := True;
+ GBPStats[gbpLVT].Handled := True;
+ end;
+ { emit the pending properties with the fewest ranges as or-ed comparisons,
+   repeating until the smallest remaining range count exceeds RangeCountThreshold }
+ repeat
+ NextRangeCount := FindMinRangeCount;
+ if NextRangeCount <= RangeCountThreshold then
+ for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+ begin
+ if not GBPStats[gbp].Handled and (Length(GBPStats[gbp].Ranges) <= NextRangeCount) then
+ begin
+ { marked before the loop so that MaybeCoalesceRanges below skips this property }
+ GBPStats[gbp].Handled := True;
+ Write(OutFile, 'if');
+ for RI := 0 to High(GBPStats[gbp].Ranges) do
+ begin
+ if RI <> 0 then
+ Writeln(OutFile, 'or');
+ with GBPStats[gbp].Ranges[RI] do
+ begin
+ if RangeLo = RangeHi then
+ Write(OutFile, '(Ch=', RangeLo, ')')
+ else
+ Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))');
+ MaybeCoalesceRanges(RangeLo, RangeHi);
+ end;
+ end;
+ Writeln(OutFile, 'then result:=',gbp,' else');
+ end;
+ end;
+ until NextRangeCount > RangeCountThreshold;
+ { High(Integer) is what FindMinRangeCount returns when no property is pending }
+ if NextRangeCount <> High(Integer) then
+ begin
+ //for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+ // if not GBPStats[gbp].Handled then
+ // Writeln(gbp, ' ', GBPStats[gbp].MinValue, '..', GBPStats[gbp].MaxValue, ' ', GBPStats[gbp].Count, ' ', Length(GBPStats[gbp].Ranges), ' ', (GBPStats[gbp].MaxValue - GBPStats[gbp].MinValue + 7) div 8);
+ { everything still pending becomes one case branch per property }
+ Writeln(OutFile, 'case Ch of');
+ for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+ begin
+ if not GBPStats[gbp].Handled then
+ begin
+ GBPStats[gbp].Handled := True;
+ for RI := 0 to High(GBPStats[gbp].Ranges) do
+ begin
+ if RI <> 0 then
+ Writeln(OutFile, ',');
+ with GBPStats[gbp].Ranges[RI] do
+ begin
+ if RangeLo = RangeHi then
+ Write(OutFile, RangeLo)
+ else
+ Write(OutFile, RangeLo, '..', RangeHi);
+ end;
+ end;
+ Writeln(OutFile, ':result:=', gbp, ';');
+ end;
+ end;
+ Writeln(OutFile, 'else result:=gbpOther end');
+ end
+ else
+ Writeln(OutFile, 'result:=gbpOther');
+ CloseFile(OutFile);
+ { Main program: zero the table (ordinal 0 is gbpOther), load the data file,
+   build the per-property statistics and generate the code. }
+ FillChar(GraphemeBreakProperties, SizeOf(GraphemeBreakProperties), 0);
+ ParseGraphemeBreakProperties('data/UCD/auxiliary/GraphemeBreakProperty.txt');
+ CalcStatsAndRanges;
+ { NOTE(review): the output file name is an empty string. In Free Pascal a text
+   file assigned an empty name is bound to standard output, so the generated
+   code would be mixed with the progress messages - confirm whether a real
+   file name was intended here. }
+ GenCode('');
+ Writeln('Done');