unicode.php 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. #!/usr/bin/php
  2. <?php
  3. /*
  4. * libsfn/unicode.php
  5. *
  6. * Copyright (C) 2020 bzt (bztsrc@gitlab)
  7. *
  8. * @brief small tool to generate unicode.h
  9. *
  10. * a simple search'n'replace would do on the texts, but we have to count undefined
  11. * code points in each block for proper code point coverage reports.
  12. *
  13. * See:
  14. * http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt
  15. * http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
  16. */
  17. // download files if they are missing
  18. foreach(["Blocks.txt", "UnicodeData.txt"] as $fn) {
  19. if(!file_exists($fn) || !filesize($fn)) {
  20. echo("Downloading UNICODE database ".$fn."... ");
  21. @file_put_contents($fn, @file_get_contents("http://www.unicode.org/Public/UCD/latest/ucd/".$fn));
  22. if(@filesize($fn) > 0) echo("OK\n"); else { @unlink($fn); die("ERROR!!!\n"); }
  23. }
  24. }
  25. // calculate
  26. echo("Counting UNICODE blocks... ");
  27. $blocks = [];
  28. foreach(file("Blocks.txt") as $line) {
  29. if(preg_match("/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\ (.+)/", $line, $m)) {
  30. $s = hexdec($m[1]); $e = hexdec($m[2]);
  31. if($s == 0xE000) $e = 0xEFFF;
  32. $blocks[] = [ $e-$s+1, $s, $e, $m[3] ];
  33. /* the upper part of Private Use Area is used for ligatures by SSFN */
  34. if($s == 0xE000)
  35. $blocks[] = [ 0xF8FF-0xF000+1, 0xF000, 0xF8FF, "Ligatures" ];
  36. }
  37. }
  38. foreach(file("UnicodeData.txt") as $line) {
  39. $l = explode(";",$line);
  40. $i = hexdec($l[0]);
  41. $j = substr($l[1],-6)=="First>"; /* blocks which have no character definitions at all */
  42. $u[]=[$i, $l[1][0]=='<' && !empty($l[10])? $l[10] : $l[1], $l[4]=="R" || $l[4]=="AL"];
  43. foreach($blocks as $k=>$b)
  44. if($i >= $b[1] && $i <= $b[2]) { if($j) $blocks[$k][0]=0; else $blocks[$k][0]--; }
  45. }
  46. // save output
  47. $s="/*
  48. * libsfn/unicode.h
  49. *
  50. * --- Generated from Blocks.txt and UnicodeData.txt by unicode.php ---
  51. *
  52. * Copyright (C) ".date("Y")." bzt (bztsrc@gitlab)
  53. *
  54. * Permission is hereby granted, free of charge, to any person
  55. * obtaining a copy of this software and associated documentation
  56. * files (the \"Software\"), to deal in the Software without
  57. * restriction, including without limitation the rights to use, copy,
  58. * modify, merge, publish, distribute, sublicense, and/or sell copies
  59. * of the Software, and to permit persons to whom the Software is
  60. * furnished to do so, subject to the following conditions:
  61. *
  62. * The above copyright notice and this permission notice shall be
  63. * included in all copies or substantial portions of the Software.
  64. *
  65. * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,
  66. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  67. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  68. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  69. * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  70. * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  71. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  72. * DEALINGS IN THE SOFTWARE.
  73. *
  74. * @brief UNICODE blocks and code point names
  75. *
  76. */
  77. /*** UNICODE blocks ***/
  78. typedef struct {
  79. int cnt;
  80. int undef;
  81. int start;
  82. int end;
  83. char *name;
  84. } unicode_block_t;
  85. #define UNICODE_NUMBLOCKS ".count($blocks)."
  86. #ifndef _UNICODE_BLOCKSDATA
  87. extern unicode_block_t ublocks[];
  88. #else
  89. unicode_block_t ublocks[] = {
  90. ";
  91. foreach($blocks as $k=>$b) {
  92. $s.=sprintf(" { 0,%3d, 0x%06x, 0x%06x, \"%s\" }",$b[0]<0||$b[0]>=$b[2]-$b[1]?0:$b[0],$b[1],$b[2],
  93. str_replace("\"","",$b[3])).(isset($blocks[$k+1])?",":"")."\n";
  94. }
  95. $s.="};\n#endif\n\n";
  96. $s.="/*** UNICODE code point names ***/
  97. #define tolowercase(X) (((X) >= 'A') && ((X) <= 'Z') ? ('a'+((X)-'A')) : (X))
  98. int unicmp(char *a, char *b);
  99. void uniname_free();
  100. int uniname(int unicode);
  101. char *utf8(int i);
  102. typedef struct {
  103. int unicode;
  104. int rtl;
  105. char *name;
  106. } uniname_t;
  107. #define UNICODE_NUMNAMES ".count($u)."
  108. #ifndef _UNICODE_NAMESDATA
  109. extern uniname_t uninames[UNICODE_NUMNAMES+1];
  110. extern char uniname_date[];
  111. #else
  112. uniname_t uninames[UNICODE_NUMNAMES+1];
  113. char uniname_date[] = \"".date("Y-m-d", filemtime("UnicodeData.txt"))."\";
  114. ";
  115. echo("OK\nCompressing UNICODE names... ");
  116. $u[0][1]="";
  117. $i=0; $N="";
  118. foreach($u as $k=>$n) {
  119. if($i < $n[0]) {
  120. while($i + 32768 < $n[0]) {
  121. $N.=pack("v", -32768);
  122. $i+=32768;
  123. }
  124. $N.=pack("v", -($n[0]-$i));
  125. }
  126. $N.=chr(intval($n[2]));
  127. $N.=str_replace("\'","\\\'",str_replace("\"","",$n[1])).chr(0);
  128. $i=$n[0] + 1;
  129. }
  130. $N = gzcompress($N, 9);
  131. $s.="#define UNICODE_DAT_SIZE ".strlen($N)."
  132. unsigned char unicode_dat[UNICODE_DAT_SIZE] = {";
  133. for($i = 0; $i < strlen($N); $i++)
  134. $s.=($i?",":"").sprintf("%d",ord($N[$i]));
  135. $s.="};\n#endif\n\n";
  136. echo("OK\n");
  137. file_put_contents("unicode.h", $s);