summaryrefslogtreecommitdiff
path: root/ext/mbstring/ucgendat/ucgendat.php
diff options
context:
space:
mode:
Diffstat (limited to 'ext/mbstring/ucgendat/ucgendat.php')
-rwxr-xr-xext/mbstring/ucgendat/ucgendat.php98
1 files changed, 88 insertions, 10 deletions
diff --git a/ext/mbstring/ucgendat/ucgendat.php b/ext/mbstring/ucgendat/ucgendat.php
index 8901a587b3..d1e887589f 100755
--- a/ext/mbstring/ucgendat/ucgendat.php
+++ b/ext/mbstring/ucgendat/ucgendat.php
@@ -4,10 +4,10 @@
/**
* This is based on the ucgendat.c file from the OpenLDAP project, licensed as
* follows. This file is not necessary to build PHP. It's only necessary to
- * rebuild unicode_data.h from Unicode ucd files.
+ * rebuild unicode_data.h and eaw_table.h from Unicode ucd files.
*
* Example usage:
- * php ucgendat.php UnicodeData.txt
+ * php ucgendat.php path/to/Unicode/data/files
*/
/* Copyright 1998-2007 The OpenLDAP Foundation.
@@ -45,7 +45,7 @@
if ($argc < 2) {
echo "Usage: php ucgendata.php ./datadir\n";
echo "./datadir must contain:\n";
- echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
+ echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
return;
}
@@ -54,8 +54,9 @@ $unicodeDataFile = $dir . '/UnicodeData.txt';
$caseFoldingFile = $dir . '/CaseFolding.txt';
$specialCasingFile = $dir . '/SpecialCasing.txt';
$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
+$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';
-$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
+$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
foreach ($files as $file) {
if (!file_exists($file)) {
echo "File $file does not exist.\n";
@@ -72,6 +73,11 @@ parseSpecialCasing($data, file_get_contents($specialCasingFile));
parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
file_put_contents($outputFile, generateData($data));
+$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
+
+$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
+file_put_contents($eawFile, generateEastAsianWidthData($eawData));
+
class Range {
public $start;
public $end;
@@ -372,6 +378,43 @@ function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
}
}
+/**
+ * Collects the codepoint ranges that EastAsianWidth.txt marks as wide.
+ *
+ * Only entries whose second field is 'W' or 'F' are kept. The first field of
+ * an entry is either a single hexadecimal codepoint or an inclusive
+ * "START..END" range. An entry that begins exactly one codepoint after the
+ * end of the most recently collected range is folded into that range instead
+ * of starting a new one, so runs of adjacent entries collapse into one Range.
+ *
+ * @param string $input Contents of EastAsianWidth.txt.
+ * @return Range[] Inclusive ranges, in the order they were encountered.
+ */
+function parseEastAsianWidth(string $input) : array {
+    $wideRanges = [];
+
+    foreach (parseDataFile($input) as $fields) {
+        if ($fields[1] !== 'W' && $fields[1] !== 'F') {
+            continue;
+        }
+
+        // strpos() returns int|false, so test against false explicitly rather
+        // than relying on the truthiness of the offset.
+        $dotsPos = strpos($fields[0], '..');
+        if ($dotsPos !== false) {
+            $startCode = intval(substr($fields[0], 0, $dotsPos), 16);
+            $endCode = intval(substr($fields[0], $dotsPos + 2), 16);
+        } else {
+            // A lone codepoint is handled as a one-element range so that both
+            // forms share the merge logic below.
+            $startCode = $endCode = intval($fields[0], 16);
+        }
+
+        // end() yields false while nothing has been collected yet. Range
+        // objects are handles, so extending $lastRange updates the stored one.
+        $lastRange = end($wideRanges);
+        if ($lastRange !== false && $startCode === $lastRange->end + 1) {
+            $lastRange->end = $endCode;
+            continue;
+        }
+
+        $wideRanges[] = new Range($startCode, $endCode);
+    }
+
+    return $wideRanges;
+}
+
function formatArray(array $values, int $width, string $format) : string {
$result = '';
$i = 0;
@@ -412,7 +455,7 @@ function generatePropData(UnicodeData $data) {
$propOffsets[] = $idx;
// TODO ucgendat.c pads the prop offsets to the next multiple of 4
- // for rather debious reasons of alignment. This should probably be
+ // for rather dubious reasons of alignment. This should probably be
// dropped
while (count($propOffsets) % 4 != 0) {
$propOffsets[] = 0;
@@ -509,17 +552,17 @@ function generateCaseData(UnicodeData $data) {
function generateData(UnicodeData $data) {
$result = <<<'HEADER'
-/* This file was generated from a modified version UCData's ucgendat.
+/* This file was generated from a modified version of UCData's ucgendat.
*
* DO NOT EDIT THIS FILE!
*
- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
- * files from http://www.unicode.org/Public/ and run this program.
+ * Instead, download the appropriate UnicodeData-x.x.x.txt and
+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
+ * and run ext/mbstring/ucgendat/ucgendat.php.
*
* More information can be found in the UCData package. Unfortunately,
* the project's page doesn't seem to be live anymore, so you can use
- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
HEADER;
$result .= "\n\n" . generatePropData($data);
$result .= generateCaseData($data);
@@ -646,3 +689,38 @@ function generateMPH(array $map, bool $fast) {
return $mph;
}
+
+/**
+ * Renders the wide codepoint ranges as C source text defining mbfl_eaw_table.
+ *
+ * The output is a fixed banner comment and struct declaration, followed by
+ * one "{ 0xSTART, 0xEND }," initializer per range (lowercase hex, each on its
+ * own tab-indented line), followed by the closing "};".
+ *
+ * @param array $wideRanges Objects exposing `start` and `end` codepoints.
+ * @return string The generated table source.
+ */
+function generateEastAsianWidthData(array $wideRanges) {
+    $banner = <<<'HEADER'
+/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
+ *
+ * DO NOT EDIT THIS FILE!
+ *
+ * East Asian Width table
+ *
+ * Some characters in East Asian languages are intended to be displayed in a space
+ * which is roughly square. (This contrasts with others such as the Latin alphabet,
+ * which are taller than they are wide.) To display these East Asian characters
+ * properly, twice the horizontal space is used. This must be taken into account
+ * when doing things like wrapping text to a specific width.
+ *
+ * Each pair of numbers in the below table is a range of Unicode codepoints
+ * which should be displayed as double-width.
+ */
+
+static const struct {
+ int begin;
+ int end;
+} mbfl_eaw_table[] = {
+
+HEADER;
+
+    // One initializer row per range; %x prints lowercase hex like dechex().
+    $rows = '';
+    foreach ($wideRanges as $wide) {
+        $rows .= sprintf("\t{ 0x%x, 0x%x },\n", $wide->start, $wide->end);
+    }
+
+    return $banner . $rows . "};\n";
+}