1 files changed, 88 insertions, 10 deletions
diff --git a/ext/mbstring/ucgendat/ucgendat.php b/ext/mbstring/ucgendat/ucgendat.php
index 8901a587b3..d1e887589f 100755
--- a/ext/mbstring/ucgendat/ucgendat.php
+++ b/ext/mbstring/ucgendat/ucgendat.php
@@ -4,10 +4,10 @@
 /**
  * This is based on the ucgendat.c file from the OpenLDAP project, licensed as
  * follows. This file is not necessary to build PHP. It's only necessary to
- * rebuild unicode_data.h from Unicode ucd files.
+ * rebuild unicode_data.h and eaw_width.h from Unicode ucd files.
  *
  * Example usage:
- * php ucgendat.php UnicodeData.txt
+ * php ucgendat.php path/to/Unicode/data/files
  */
 
 /* Copyright 1998-2007 The OpenLDAP Foundation.
@@ -45,7 +45,7 @@
 if ($argc < 2) {
     echo "Usage: php ucgendata.php ./datadir\n";
     echo "./datadir must contain:\n";
-    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
+    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
     return;
 }
 
@@ -54,8 +54,9 @@ $unicodeDataFile = $dir . '/UnicodeData.txt';
 $caseFoldingFile = $dir . '/CaseFolding.txt';
 $specialCasingFile = $dir . '/SpecialCasing.txt';
 $derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
+$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';
 
-$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
+$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
 foreach ($files as $file) {
     if (!file_exists($file)) {
         echo "File $file does not exist.\n";
@@ -72,6 +73,11 @@ parseSpecialCasing($data, file_get_contents($specialCasingFile));
 parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
 file_put_contents($outputFile, generateData($data));
 
+$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
+
+$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
+file_put_contents($eawFile, generateEastAsianWidthData($eawData));
+
 class Range {
     public $start;
     public $end;
@@ -372,6 +378,43 @@ function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
     }
 }
 
+function parseEastAsianWidth(string $input) : array {
+    $wideRanges = [];
+
+    foreach (parseDataFile($input) as $fields) {
+        if ($fields[1] == 'W' || $fields[1] == 'F') {
+            if ($dotsPos = strpos($fields[0], '..')) {
+                $startCode = intval(substr($fields[0], 0, $dotsPos), 16);
+                $endCode = intval(substr($fields[0], $dotsPos + 2), 16);
+
+                if (!empty($wideRanges)) {
+                    $lastRange = $wideRanges[count($wideRanges) - 1];
+                    if ($startCode == $lastRange->end + 1) {
+                        $lastRange->end = $endCode;
+                        continue;
+                    }
+                }
+
+                $wideRanges[] = new Range($startCode, $endCode);
+            } else {
+                $code = intval($fields[0], 16);
+
+                if (!empty($wideRanges)) {
+                    $lastRange = $wideRanges[count($wideRanges) - 1];
+                    if ($code == $lastRange->end + 1) {
+                        $lastRange->end++;
+                        continue;
+                    }
+                }
+
+                $wideRanges[] = new Range($code, $code);
+            }
+        }
+    }
+
+    return $wideRanges;
+}
+
 function formatArray(array $values, int $width, string $format) : string {
     $result = '';
     $i = 0;
@@ -412,7 +455,7 @@ function generatePropData(UnicodeData $data) {
     $propOffsets[] = $idx;
 
     // TODO ucgendat.c pads the prop offsets to the next multiple of 4
-    // for rather debious reasons of alignment. This should probably be
+    // for rather dubious reasons of alignment. This should probably be
     // dropped
     while (count($propOffsets) % 4 != 0) {
         $propOffsets[] = 0;
@@ -509,17 +552,17 @@ function generateCaseData(UnicodeData $data) {
 
 function generateData(UnicodeData $data) {
     $result = <<<'HEADER'
-/* This file was generated from a modified version UCData's ucgendat.
+/* This file was generated from a modified version of UCData's ucgendat.
  *
  *                     DO NOT EDIT THIS FILE!
  *
- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
- * files from  http://www.unicode.org/Public/ and run this program.
+ * Instead, download the appropriate UnicodeData-x.x.x.txt and
+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
+ * and run ext/mbstring/ucgendat/ucgendat.php.
  *
  * More information can be found in the UCData package. Unfortunately,
  * the project's page doesn't seem to be live anymore, so you can use
- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
 HEADER;
     $result .= "\n\n" . generatePropData($data);
     $result .= generateCaseData($data);
@@ -646,3 +689,38 @@ function generateMPH(array $map, bool $fast) {
 
     return $mph;
 }
+
+function generateEastAsianWidthData(array $wideRanges) {
+      $result = <<<'HEADER'
+/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
+ *
+ *                     DO NOT EDIT THIS FILE!
+ *
+ * East Asian Width table
+ *
+ * Some characters in East Asian languages are intended to be displayed in a space
+ * which is roughly square. (This contrasts with others such as the Latin alphabet,
+ * which are taller than they are wide.) To display these East Asian characters
+ * properly, twice the horizontal space is used. This must be taken into account
+ * when doing things like wrapping text to a specific width.
+ *
+ * Each pair of numbers in the below table is a range of Unicode codepoints
+ * which should be displayed as double-width.
+ */
+
+static const struct {
+	int begin;
+	int end;
+} mbfl_eaw_table[] = {
+
+HEADER;
+
+    foreach ($wideRanges as $range) {
+        $startCode = dechex($range->start);
+        $endCode = dechex($range->end);
+        $result .= "\t{ 0x{$startCode}, 0x{$endCode} },\n";
+    }
+
+    $result .= "};\n";
+    return $result;
+}