bidi.c 54 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686
  1. /*
  2. * Implementation of the Unicode bidirectional and Arabic shaping
  3. * algorithms for PuTTY.
  4. *
  5. * Original version written and kindly contributed to this code base
  6. * by Ahmad Khalifa of Arabeyes. The bidi part was almost completely
  7. * rewritten in 2021 by Simon Tatham to bring it up to date, but the
  8. * shaping part is still the one by the original authors.
  9. *
  10. * Implementation notes:
  11. *
  12. * Algorithm version
  13. * -----------------
  14. *
  15. * This algorithm is up to date with Unicode Standard Annex #9
  16. * revision 46:
  17. *
  18. * https://www.unicode.org/reports/tr9/tr9-46.html
  19. *
  20. * and passes the full conformance test suite in Unicode 15.0.0.
  21. *
  22. * Paragraph and line handling
  23. * ---------------------------
  24. *
  25. * The full Unicode bidi algorithm expects to receive text containing
  26. * multiple paragraphs, together with a decision about how those
  27. * paragraphs are broken up into lines. It calculates embedding levels
  28. * a whole paragraph at a time without considering the line breaks,
  29. * but then the final reordering of the text for display is done to
  30. * each _line_ independently based on the levels computed for the text
  31. * in that line.
  32. *
  33. * This algorithm omits all of that, because it's intended for use as
  34. * a display-time transformation of a text terminal, which doesn't
  35. * preserve enough semantic information to decide what's a paragraph
  36. * break and what is not. So a piece of input text provided to this
  37. * algorithm is always expected to consist of exactly one paragraph
  38. * *and* exactly one line.
  39. *
  40. * Embeddings, overrides and isolates
  41. * ----------------------------------
  42. *
  43. * This implementation has full support for all the Unicode special
  44. * control characters that modify bidi behaviour, such as
  45. *
  46. * U+202A LEFT-TO-RIGHT EMBEDDING
  47. * U+202B RIGHT-TO-LEFT EMBEDDING
  48. * U+202D LEFT-TO-RIGHT OVERRIDE
  49. * U+202E RIGHT-TO-LEFT OVERRIDE
  50. * U+202C POP DIRECTIONAL FORMATTING
  51. * U+2068 FIRST STRONG ISOLATE
  52. * U+2066 LEFT-TO-RIGHT ISOLATE
  53. * U+2067 RIGHT-TO-LEFT ISOLATE
  54. * U+2069 POP DIRECTIONAL ISOLATE
  55. *
  56. * However, at present, the terminal emulator that is a client of this
  57. * code has no way to pass those in (because they're dropped during
  58. * escape sequence processing and don't get stored in the terminal
  59. * state). Nonetheless, the code is all here, so if the terminal
  60. * emulator becomes able to record those characters at some later
  61. * point, we'll be all set to take account of them during bidi.
  62. *
  63. * But the _main_ purpose of supporting the full bidi algorithm is
  64. * simply that that's the easiest way to be sure it's correct, because
  65. * if you support the whole thing, you can run the full conformance
  66. * test suite. (And I don't 100% believe that restricting to the
  67. * subset of _tests_ valid with a reduced character set will test the
  68. * full set of _functionality_ relevant to the reduced set.)
  69. *
  70. * Retained formatting characters
  71. * ------------------------------
  72. *
  73. * The standard bidi algorithm, in step X9, deletes assorted
  74. * formatting characters from the text: all the embedding and override
  75. * section initiator characters, the Pop Directional Formatting
  76. * character that closes one of those sections again, and any
  77. * character labelled as Boundary Neutral. So the characters it
  78. * returns are not a _full_ reordering of the input; some input
  79. * characters vanish completely.
  80. *
  81. * This would be fine, if it were not for the fact that - as far as I
  82. * can see - _exactly one_ Unicode code point in the discarded
  83. * category has a wcwidth() of more than 0, namely U+00AD SOFT HYPHEN
  84. * which is a printing character for terminal purposes but has a bidi
  85. * class of BN.
  86. *
  87. * Therefore, we must implement a modified version of the algorithm,
  88. * as described in section 5.2 of TR9, which retains those formatting
  89. * characters so that a client can find out where they ended up in the
  90. * reordering.
  91. *
  92. * Section 5.2 describes a set of modifications to the algorithm that
  93. * are _intended_ to achieve this without changing the rest of the
  94. * behaviour: that is, if you take the output of the modified
  95. * algorithm and delete all the characters that the standard algorithm
  96. * would have removed, you should end up with the remaining characters
  97. * in the same order that the standard algorithm would have delivered.
  98. * However, section 5.2 admits the possibility of error, and says "in
  99. * case of any deviation the explicit algorithm is the normative
  100. * statement for conformance". And indeed, in one or two places I
  101. * found I had to make my own tweaks to the section 5.2 description in
  102. * order to get the whole test suite to pass, because I think the 5.2
  103. * modifications if taken literally don't quite achieve that. My
  104. * justification is that sentence of 5.2: in case of doubt, the right
  105. * thing is to make the code behave the same as the official
  106. * algorithm.
  107. *
  108. * It's possible that there might still be some undiscovered
  109. * discrepancies between the behaviour of the standard and modified
  110. * algorithms. So, just in case, I've kept in this code the ability to
  111. * implement the _standard_ algorithm too! If you compile with
  112. * -DREMOVE_FORMATTING_CHARS, this code should go back to implementing
  113. * the literal UAX#9 bidi algorithm - so you can run your suspect
  114. * input through both versions, making it much easier to figure out
  115. * why they differ, and in which of the many stages of the algorithm
  116. * the difference was introduced.
  117. *
  118. * However, beware that when compiling in this mode, the do_bidi
  119. * interface to the terminal will stop working, and just abort() when
  120. * called! The only useful thing you can do with this mode is to run
  121. * the companion program bidi_test.c.
  122. */
  123. #include <stdlib.h> /* definition of wchar_t */
  124. #include "putty.h"
  125. #include "misc.h"
  126. #include "bidi.h"
  127. typedef struct {
  128. char type;
  129. wchar_t form_b;
  130. } shape_node;
  131. /* Kept near the actual table, for verification. */
  132. #define SHAPE_FIRST 0x621
  133. #define SHAPE_LAST (SHAPE_FIRST + lenof(shapetypes) - 1)
  134. static const shape_node shapetypes[] = {
  135. /* index, Typ, Iso, Ligature Index*/
  136. /* 621 */ {SU, 0xFE80},
  137. /* 622 */ {SR, 0xFE81},
  138. /* 623 */ {SR, 0xFE83},
  139. /* 624 */ {SR, 0xFE85},
  140. /* 625 */ {SR, 0xFE87},
  141. /* 626 */ {SD, 0xFE89},
  142. /* 627 */ {SR, 0xFE8D},
  143. /* 628 */ {SD, 0xFE8F},
  144. /* 629 */ {SR, 0xFE93},
  145. /* 62A */ {SD, 0xFE95},
  146. /* 62B */ {SD, 0xFE99},
  147. /* 62C */ {SD, 0xFE9D},
  148. /* 62D */ {SD, 0xFEA1},
  149. /* 62E */ {SD, 0xFEA5},
  150. /* 62F */ {SR, 0xFEA9},
  151. /* 630 */ {SR, 0xFEAB},
  152. /* 631 */ {SR, 0xFEAD},
  153. /* 632 */ {SR, 0xFEAF},
  154. /* 633 */ {SD, 0xFEB1},
  155. /* 634 */ {SD, 0xFEB5},
  156. /* 635 */ {SD, 0xFEB9},
  157. /* 636 */ {SD, 0xFEBD},
  158. /* 637 */ {SD, 0xFEC1},
  159. /* 638 */ {SD, 0xFEC5},
  160. /* 639 */ {SD, 0xFEC9},
  161. /* 63A */ {SD, 0xFECD},
  162. /* 63B */ {SU, 0x0},
  163. /* 63C */ {SU, 0x0},
  164. /* 63D */ {SU, 0x0},
  165. /* 63E */ {SU, 0x0},
  166. /* 63F */ {SU, 0x0},
  167. /* 640 */ {SC, 0x0},
  168. /* 641 */ {SD, 0xFED1},
  169. /* 642 */ {SD, 0xFED5},
  170. /* 643 */ {SD, 0xFED9},
  171. /* 644 */ {SD, 0xFEDD},
  172. /* 645 */ {SD, 0xFEE1},
  173. /* 646 */ {SD, 0xFEE5},
  174. /* 647 */ {SD, 0xFEE9},
  175. /* 648 */ {SR, 0xFEED},
  176. /* 649 */ {SR, 0xFEEF}, /* SD */
  177. /* 64A */ {SD, 0xFEF1},
  178. /* 64B */ {SU, 0x0},
  179. /* 64C */ {SU, 0x0},
  180. /* 64D */ {SU, 0x0},
  181. /* 64E */ {SU, 0x0},
  182. /* 64F */ {SU, 0x0},
  183. /* 650 */ {SU, 0x0},
  184. /* 651 */ {SU, 0x0},
  185. /* 652 */ {SU, 0x0},
  186. /* 653 */ {SU, 0x0},
  187. /* 654 */ {SU, 0x0},
  188. /* 655 */ {SU, 0x0},
  189. /* 656 */ {SU, 0x0},
  190. /* 657 */ {SU, 0x0},
  191. /* 658 */ {SU, 0x0},
  192. /* 659 */ {SU, 0x0},
  193. /* 65A */ {SU, 0x0},
  194. /* 65B */ {SU, 0x0},
  195. /* 65C */ {SU, 0x0},
  196. /* 65D */ {SU, 0x0},
  197. /* 65E */ {SU, 0x0},
  198. /* 65F */ {SU, 0x0},
  199. /* 660 */ {SU, 0x0},
  200. /* 661 */ {SU, 0x0},
  201. /* 662 */ {SU, 0x0},
  202. /* 663 */ {SU, 0x0},
  203. /* 664 */ {SU, 0x0},
  204. /* 665 */ {SU, 0x0},
  205. /* 666 */ {SU, 0x0},
  206. /* 667 */ {SU, 0x0},
  207. /* 668 */ {SU, 0x0},
  208. /* 669 */ {SU, 0x0},
  209. /* 66A */ {SU, 0x0},
  210. /* 66B */ {SU, 0x0},
  211. /* 66C */ {SU, 0x0},
  212. /* 66D */ {SU, 0x0},
  213. /* 66E */ {SU, 0x0},
  214. /* 66F */ {SU, 0x0},
  215. /* 670 */ {SU, 0x0},
  216. /* 671 */ {SR, 0xFB50},
  217. /* 672 */ {SU, 0x0},
  218. /* 673 */ {SU, 0x0},
  219. /* 674 */ {SU, 0x0},
  220. /* 675 */ {SU, 0x0},
  221. /* 676 */ {SU, 0x0},
  222. /* 677 */ {SU, 0x0},
  223. /* 678 */ {SU, 0x0},
  224. /* 679 */ {SD, 0xFB66},
  225. /* 67A */ {SD, 0xFB5E},
  226. /* 67B */ {SD, 0xFB52},
  227. /* 67C */ {SU, 0x0},
  228. /* 67D */ {SU, 0x0},
  229. /* 67E */ {SD, 0xFB56},
  230. /* 67F */ {SD, 0xFB62},
  231. /* 680 */ {SD, 0xFB5A},
  232. /* 681 */ {SU, 0x0},
  233. /* 682 */ {SU, 0x0},
  234. /* 683 */ {SD, 0xFB76},
  235. /* 684 */ {SD, 0xFB72},
  236. /* 685 */ {SU, 0x0},
  237. /* 686 */ {SD, 0xFB7A},
  238. /* 687 */ {SD, 0xFB7E},
  239. /* 688 */ {SR, 0xFB88},
  240. /* 689 */ {SU, 0x0},
  241. /* 68A */ {SU, 0x0},
  242. /* 68B */ {SU, 0x0},
  243. /* 68C */ {SR, 0xFB84},
  244. /* 68D */ {SR, 0xFB82},
  245. /* 68E */ {SR, 0xFB86},
  246. /* 68F */ {SU, 0x0},
  247. /* 690 */ {SU, 0x0},
  248. /* 691 */ {SR, 0xFB8C},
  249. /* 692 */ {SU, 0x0},
  250. /* 693 */ {SU, 0x0},
  251. /* 694 */ {SU, 0x0},
  252. /* 695 */ {SU, 0x0},
  253. /* 696 */ {SU, 0x0},
  254. /* 697 */ {SU, 0x0},
  255. /* 698 */ {SR, 0xFB8A},
  256. /* 699 */ {SU, 0x0},
  257. /* 69A */ {SU, 0x0},
  258. /* 69B */ {SU, 0x0},
  259. /* 69C */ {SU, 0x0},
  260. /* 69D */ {SU, 0x0},
  261. /* 69E */ {SU, 0x0},
  262. /* 69F */ {SU, 0x0},
  263. /* 6A0 */ {SU, 0x0},
  264. /* 6A1 */ {SU, 0x0},
  265. /* 6A2 */ {SU, 0x0},
  266. /* 6A3 */ {SU, 0x0},
  267. /* 6A4 */ {SD, 0xFB6A},
  268. /* 6A5 */ {SU, 0x0},
  269. /* 6A6 */ {SD, 0xFB6E},
  270. /* 6A7 */ {SU, 0x0},
  271. /* 6A8 */ {SU, 0x0},
  272. /* 6A9 */ {SD, 0xFB8E},
  273. /* 6AA */ {SU, 0x0},
  274. /* 6AB */ {SU, 0x0},
  275. /* 6AC */ {SU, 0x0},
  276. /* 6AD */ {SD, 0xFBD3},
  277. /* 6AE */ {SU, 0x0},
  278. /* 6AF */ {SD, 0xFB92},
  279. /* 6B0 */ {SU, 0x0},
  280. /* 6B1 */ {SD, 0xFB9A},
  281. /* 6B2 */ {SU, 0x0},
  282. /* 6B3 */ {SD, 0xFB96},
  283. /* 6B4 */ {SU, 0x0},
  284. /* 6B5 */ {SU, 0x0},
  285. /* 6B6 */ {SU, 0x0},
  286. /* 6B7 */ {SU, 0x0},
  287. /* 6B8 */ {SU, 0x0},
  288. /* 6B9 */ {SU, 0x0},
  289. /* 6BA */ {SR, 0xFB9E},
  290. /* 6BB */ {SD, 0xFBA0},
  291. /* 6BC */ {SU, 0x0},
  292. /* 6BD */ {SU, 0x0},
  293. /* 6BE */ {SD, 0xFBAA},
  294. /* 6BF */ {SU, 0x0},
  295. /* 6C0 */ {SR, 0xFBA4},
  296. /* 6C1 */ {SD, 0xFBA6},
  297. /* 6C2 */ {SU, 0x0},
  298. /* 6C3 */ {SU, 0x0},
  299. /* 6C4 */ {SU, 0x0},
  300. /* 6C5 */ {SR, 0xFBE0},
  301. /* 6C6 */ {SR, 0xFBD9},
  302. /* 6C7 */ {SR, 0xFBD7},
  303. /* 6C8 */ {SR, 0xFBDB},
  304. /* 6C9 */ {SR, 0xFBE2},
  305. /* 6CA */ {SU, 0x0},
  306. /* 6CB */ {SR, 0xFBDE},
  307. /* 6CC */ {SD, 0xFBFC},
  308. /* 6CD */ {SU, 0x0},
  309. /* 6CE */ {SU, 0x0},
  310. /* 6CF */ {SU, 0x0},
  311. /* 6D0 */ {SU, 0x0},
  312. /* 6D1 */ {SU, 0x0},
  313. /* 6D2 */ {SR, 0xFBAE},
  314. };
  315. /*
  316. * Returns the bidi character type of ch.
  317. */
  318. unsigned char bidi_getType(int ch)
  319. {
  320. static const struct {
  321. int first, last, type;
  322. } lookup[] = {
  323. #include "unicode/bidi_type.h"
  324. };
  325. int i, j, k;
  326. i = -1;
  327. j = lenof(lookup);
  328. while (j - i > 1) {
  329. k = (i + j) / 2;
  330. if (ch < lookup[k].first)
  331. j = k;
  332. else if (ch > lookup[k].last)
  333. i = k;
  334. else
  335. return lookup[k].type;
  336. }
  337. /*
  338. * If we reach here, the character was not in any of the
  339. * intervals listed in the lookup table. This means we return
  340. * ON (`Other Neutrals'). This is the appropriate code for any
  341. * character genuinely not listed in the Unicode table, and
  342. * also the table above has deliberately left out any
  343. * characters _explicitly_ listed as ON (to save space!).
  344. */
  345. return ON;
  346. }
  347. /*
  348. * Return the mirrored version of a glyph.
  349. *
  350. * FIXME: there are also glyphs which the text rendering engine is
  351. * supposed to display left-right reflected, since no mirrored glyph
  352. * exists in Unicode itself to indicate the reflected form. Those are
  353. * listed in comments in BidiMirroring.txt. Many of them are
  354. * mathematical, e.g. the square root sign, or set difference
  355. * operator, or integral sign. No API currently exists here to
  356. * communicate the need for that reflected display back to the client.
  357. */
  358. static unsigned mirror_glyph(unsigned int ch)
  359. {
  360. static const struct {
  361. unsigned src, dst;
  362. } mirror_pairs[] = {
  363. #include "unicode/bidi_mirror.h"
  364. };
  365. int i, j, k;
  366. i = -1;
  367. j = lenof(mirror_pairs);
  368. while (j - i > 1) {
  369. k = (i + j) / 2;
  370. if (ch < mirror_pairs[k].src)
  371. j = k;
  372. else if (ch > mirror_pairs[k].src)
  373. i = k;
  374. else
  375. return mirror_pairs[k].dst;
  376. }
  377. return ch;
  378. }
  379. /*
  380. * Identify the bracket characters treated specially by bidi rule
  381. * BD19, and return their paired character(s).
  382. */
  383. typedef enum { BT_NONE, BT_OPEN, BT_CLOSE } BracketType;
  384. typedef struct BracketTypeData {
  385. unsigned partner, equiv_partner;
  386. BracketType type;
  387. } BracketTypeData;
  388. static BracketTypeData bracket_type(unsigned int ch)
  389. {
  390. static const struct {
  391. unsigned src;
  392. BracketTypeData payload;
  393. } bracket_pairs[] = {
  394. #include "unicode/bidi_brackets.h"
  395. };
  396. int i, j, k;
  397. i = -1;
  398. j = lenof(bracket_pairs);
  399. while (j - i > 1) {
  400. k = (i + j) / 2;
  401. if (ch < bracket_pairs[k].src) {
  402. j = k;
  403. } else if (ch > bracket_pairs[k].src) {
  404. i = k;
  405. } else {
  406. return bracket_pairs[k].payload;
  407. }
  408. }
  409. static const BracketTypeData null = { 0, 0, BT_NONE };
  410. return null;
  411. }
  412. /*
  413. * Function exported to front ends to allow them to identify
  414. * bidi-active characters (in case, for example, the platform's
  415. * text display function can't conveniently be prevented from doing
  416. * its own bidi and so special treatment is required for characters
  417. * that would cause the bidi algorithm to activate).
  418. *
  419. * This function is passed a single Unicode code point, and returns
  420. * nonzero if the presence of this code point can possibly cause
  421. * the bidi algorithm to do any reordering. Thus, any string
  422. * composed entirely of characters for which is_rtl() returns zero
  423. * should be safe to pass to a bidi-active platform display
  424. * function without fear.
  425. *
  426. * (is_rtl() must therefore also return true for any character
  427. * which would be affected by Arabic shaping, but this isn't
  428. * important because all such characters are right-to-left so it
  429. * would have flagged them anyway.)
  430. */
  431. bool is_rtl(int c)
  432. {
  433. return typeIsBidiActive(bidi_getType(c));
  434. }
  435. /* The Main shaping function, and the only one to be used
  436. * by the outside world.
  437. *
  438. * line: buffer to apply shaping to. this must be passed by doBidi() first
  439. * to: output buffer for the shaped data
  440. * count: number of characters in line
  441. */
  442. int do_shape(bidi_char *line, bidi_char *to, int count)
  443. {
  444. int i, tempShape;
  445. bool ligFlag = false;
  446. for (i=0; i<count; i++) {
  447. to[i] = line[i];
  448. tempShape = STYPE(line[i].wc);
  449. switch (tempShape) {
  450. case SC:
  451. break;
  452. case SU:
  453. break;
  454. case SR:
  455. tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
  456. if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
  457. to[i].wc = SFINAL((SISOLATED(line[i].wc)));
  458. else
  459. to[i].wc = SISOLATED(line[i].wc);
  460. break;
  461. case SD:
  462. /* Make Ligatures */
  463. tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
  464. if (line[i].wc == 0x644) {
  465. if (i > 0) switch (line[i-1].wc) {
  466. case 0x622:
  467. ligFlag = true;
  468. if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
  469. to[i].wc = 0xFEF6;
  470. else
  471. to[i].wc = 0xFEF5;
  472. break;
  473. case 0x623:
  474. ligFlag = true;
  475. if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
  476. to[i].wc = 0xFEF8;
  477. else
  478. to[i].wc = 0xFEF7;
  479. break;
  480. case 0x625:
  481. ligFlag = true;
  482. if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
  483. to[i].wc = 0xFEFA;
  484. else
  485. to[i].wc = 0xFEF9;
  486. break;
  487. case 0x627:
  488. ligFlag = true;
  489. if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
  490. to[i].wc = 0xFEFC;
  491. else
  492. to[i].wc = 0xFEFB;
  493. break;
  494. }
  495. if (ligFlag) {
  496. to[i-1].wc = 0x20;
  497. ligFlag = false;
  498. break;
  499. }
  500. }
  501. if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC)) {
  502. tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
  503. if ((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
  504. to[i].wc = SMEDIAL((SISOLATED(line[i].wc)));
  505. else
  506. to[i].wc = SFINAL((SISOLATED(line[i].wc)));
  507. break;
  508. }
  509. tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
  510. if ((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
  511. to[i].wc = SINITIAL((SISOLATED(line[i].wc)));
  512. else
  513. to[i].wc = SISOLATED(line[i].wc);
  514. break;
  515. }
  516. }
  517. return 1;
  518. }
  519. typedef enum { DO_NEUTRAL, DO_LTR, DO_RTL } DirectionalOverride;
  520. typedef struct DSStackEntry {
  521. /*
  522. * An entry in the directional status stack (rule section X).
  523. */
  524. unsigned char level;
  525. bool isolate;
  526. DirectionalOverride override;
  527. } DSStackEntry;
  528. typedef struct BracketStackEntry {
  529. /*
  530. * An entry in the bracket-pair-tracking stack (rule BD16).
  531. */
  532. unsigned ch;
  533. size_t c;
  534. } BracketStackEntry;
  535. typedef struct IsolatingRunSequence {
  536. size_t start, end;
  537. BidiType sos, eos, embeddingDirection;
  538. } IsolatingRunSequence;
  539. #define MAX_DEPTH 125 /* specified in the standard */
  540. struct BidiContext {
  541. /*
  542. * Storage space preserved between runs, all allocated to the same
  543. * length (internal_array_sizes).
  544. */
  545. size_t internal_array_sizes;
  546. BidiType *types, *origTypes;
  547. unsigned char *levels;
  548. size_t *irsindices, *bracketpos;
  549. bool *irsdone;
  550. /*
  551. * Separately allocated with its own size field
  552. */
  553. IsolatingRunSequence *irslist;
  554. size_t irslistsize;
  555. /*
  556. * Rewritten to point to the input to the currently active run of
  557. * the bidi algorithm
  558. */
  559. bidi_char *text;
  560. size_t textlen;
  561. /*
  562. * State within a run of the algorithm
  563. */
  564. BidiType paragraphOverride;
  565. DSStackEntry dsstack[MAX_DEPTH + 2];
  566. size_t ds_sp;
  567. size_t overflowIsolateCount, overflowEmbeddingCount, validIsolateCount;
  568. unsigned char paragraphLevel;
  569. size_t *irs;
  570. size_t irslen;
  571. BidiType sos, eos, embeddingDirection;
  572. BracketStackEntry bstack[63]; /* constant size specified in rule BD16 */
  573. };
  574. BidiContext *bidi_new_context(void)
  575. {
  576. BidiContext *ctx = snew(BidiContext);
  577. memset(ctx, 0, sizeof(BidiContext));
  578. return ctx;
  579. }
  580. void bidi_free_context(BidiContext *ctx)
  581. {
  582. sfree(ctx->types);
  583. sfree(ctx->origTypes);
  584. sfree(ctx->levels);
  585. sfree(ctx->irsindices);
  586. sfree(ctx->irsdone);
  587. sfree(ctx->bracketpos);
  588. sfree(ctx->irslist);
  589. sfree(ctx);
  590. }
  591. static void ensure_arrays(BidiContext *ctx, size_t textlen)
  592. {
  593. if (textlen <= ctx->internal_array_sizes)
  594. return;
  595. ctx->internal_array_sizes = textlen;
  596. ctx->types = sresize(ctx->types, ctx->internal_array_sizes, BidiType);
  597. ctx->origTypes = sresize(ctx->origTypes, ctx->internal_array_sizes,
  598. BidiType);
  599. ctx->levels = sresize(ctx->levels, ctx->internal_array_sizes,
  600. unsigned char);
  601. ctx->irsindices = sresize(ctx->irsindices, ctx->internal_array_sizes,
  602. size_t);
  603. ctx->irsdone = sresize(ctx->irsdone, ctx->internal_array_sizes, bool);
  604. ctx->bracketpos = sresize(ctx->bracketpos, ctx->internal_array_sizes,
  605. size_t);
  606. }
  607. static void setup_types(BidiContext *ctx)
  608. {
  609. for (size_t i = 0; i < ctx->textlen; i++)
  610. ctx->types[i] = ctx->origTypes[i] = bidi_getType(ctx->text[i].wc);
  611. }
  612. static bool text_needs_bidi(BidiContext *ctx)
  613. {
  614. /*
  615. * Initial optimisation: check for any bidi-active character at
  616. * all in an input line. If there aren't any, we can skip the
  617. * whole algorithm.
  618. *
  619. * Also include the paragraph override in this check!
  620. */
  621. for (size_t i = 0; i < ctx->textlen; i++)
  622. if (typeIsBidiActive(ctx->types[i]))
  623. return true;
  624. return typeIsBidiActive(ctx->paragraphOverride);
  625. }
  626. static size_t find_matching_pdi(const BidiType *types, size_t i, size_t size)
  627. {
  628. /* Assuming that types[i] is an isolate initiator, find its
  629. * matching PDI by rule BD9. */
  630. unsigned counter = 1;
  631. i++;
  632. for (; i < size; i++) {
  633. BidiType t = types[i];
  634. if (typeIsIsolateInitiator(t)) {
  635. counter++;
  636. } else if (t == PDI) {
  637. counter--;
  638. if (counter == 0)
  639. return i;
  640. }
  641. }
  642. /* If no PDI was found, return the length of the array. */
  643. return size;
  644. }
  645. static unsigned char rule_p2_p3(const BidiType *types, size_t size)
  646. {
  647. /*
  648. * Rule P2. Find the first strong type (L, R or AL), ignoring
  649. * anything inside an isolated segment.
  650. *
  651. * Rule P3. If that type is R or AL, choose a paragraph embeddding
  652. * level of 1, otherwise 0.
  653. */
  654. for (size_t i = 0; i < size; i++) {
  655. BidiType t = types[i];
  656. if (typeIsIsolateInitiator(t))
  657. i = find_matching_pdi(types, i, size);
  658. else if (typeIsStrong(t))
  659. return (t == L ? 0 : 1);
  660. }
  661. return 0; /* default if no strong type found */
  662. }
  663. static void set_paragraph_level(BidiContext *ctx)
  664. {
  665. if (ctx->paragraphOverride == L)
  666. ctx->paragraphLevel = 0;
  667. else if (ctx->paragraphOverride == R)
  668. ctx->paragraphLevel = 1;
  669. else
  670. ctx->paragraphLevel = rule_p2_p3(ctx->types, ctx->textlen);
  671. }
  672. static inline unsigned char nextOddLevel(unsigned char x) { return (x+1)|1; }
  673. static inline unsigned char nextEvenLevel(unsigned char x) { return (x|1)+1; }
  674. static inline void push(BidiContext *ctx, unsigned char level,
  675. DirectionalOverride override, bool isolate)
  676. {
  677. ctx->ds_sp++;
  678. assert(ctx->ds_sp < lenof(ctx->dsstack));
  679. ctx->dsstack[ctx->ds_sp].level = level;
  680. ctx->dsstack[ctx->ds_sp].override = override;
  681. ctx->dsstack[ctx->ds_sp].isolate = isolate;
  682. }
  683. static inline void pop(BidiContext *ctx)
  684. {
  685. assert(ctx->ds_sp > 0);
  686. ctx->ds_sp--;
  687. }
  688. static void process_explicit_embeddings(BidiContext *ctx)
  689. {
  690. /*
  691. * Rule X1 initialisation.
  692. */
  693. ctx->ds_sp = (size_t)-1;
  694. push(ctx, ctx->paragraphLevel, DO_NEUTRAL, false);
  695. ctx->overflowIsolateCount = 0;
  696. ctx->overflowEmbeddingCount = 0;
  697. ctx->validIsolateCount = 0;
  698. #define stk (&ctx->dsstack[ctx->ds_sp])
  699. for (size_t i = 0; i < ctx->textlen; i++) {
  700. BidiType t = ctx->types[i];
  701. switch (t) {
  702. case RLE: case LRE: case RLO: case LRO: {
  703. /* Rules X2-X5 */
  704. unsigned char newLevel;
  705. DirectionalOverride override;
  706. #ifndef REMOVE_FORMATTING_CHARS
  707. ctx->levels[i] = stk->level;
  708. #endif
  709. switch (t) {
  710. case RLE: /* rule X2 */
  711. newLevel = nextOddLevel(stk->level);
  712. override = DO_NEUTRAL;
  713. break;
  714. case LRE: /* rule X3 */
  715. newLevel = nextEvenLevel(stk->level);
  716. override = DO_NEUTRAL;
  717. break;
  718. case RLO: /* rule X4 */
  719. newLevel = nextOddLevel(stk->level);
  720. override = DO_RTL;
  721. break;
  722. case LRO: /* rule X5 */
  723. newLevel = nextEvenLevel(stk->level);
  724. override = DO_LTR;
  725. break;
  726. default:
  727. unreachable("how did this get past the outer switch?");
  728. }
  729. if (newLevel <= MAX_DEPTH &&
  730. ctx->overflowIsolateCount == 0 &&
  731. ctx->overflowEmbeddingCount == 0) {
  732. /* Embedding code is valid. Push a stack entry. */
  733. push(ctx, newLevel, override, false);
  734. } else {
  735. /* Embedding code is an overflow one. */
  736. if (ctx->overflowIsolateCount == 0)
  737. ctx->overflowEmbeddingCount++;
  738. }
  739. break;
  740. }
  741. case RLI: case LRI: case FSI: {
  742. /* Rules X5a, X5b, X5c */
  743. if (t == FSI) {
  744. /* Rule X5c: decide whether this should be treated
  745. * like RLI or LRI */
  746. size_t pdi = find_matching_pdi(ctx->types, i, ctx->textlen);
  747. unsigned char level = rule_p2_p3(ctx->types + (i + 1),
  748. pdi - (i + 1));
  749. t = (level == 1 ? RLI : LRI);
  750. }
  751. ctx->levels[i] = stk->level;
  752. if (stk->override != DO_NEUTRAL)
  753. ctx->types[i] = (stk->override == DO_LTR ? L :
  754. stk->override == DO_RTL ? R : t);
  755. unsigned char newLevel = (t == RLI ? nextOddLevel(stk->level) :
  756. nextEvenLevel(stk->level));
  757. if (newLevel <= MAX_DEPTH &&
  758. ctx->overflowIsolateCount == 0 &&
  759. ctx->overflowEmbeddingCount == 0) {
  760. /* Isolate code is valid. Push a stack entry. */
  761. push(ctx, newLevel, DO_NEUTRAL, true);
  762. ctx->validIsolateCount++;
  763. } else {
  764. /* Isolate code is an overflow one. */
  765. ctx->overflowIsolateCount++;
  766. }
  767. break;
  768. }
  769. case PDI: {
  770. /* Rule X6a */
  771. if (ctx->overflowIsolateCount > 0) {
  772. ctx->overflowIsolateCount--;
  773. } else if (ctx->validIsolateCount == 0) {
  774. /* Do nothing: spurious isolate-pop */
  775. } else {
  776. /* Valid isolate-pop. We expect that the stack must
  777. * therefore contain at least one isolate==true entry,
  778. * so pop everything up to and including it. */
  779. ctx->overflowEmbeddingCount = 0;
  780. while (!stk->isolate)
  781. pop(ctx);
  782. pop(ctx);
  783. ctx->validIsolateCount--;
  784. }
  785. ctx->levels[i] = stk->level;
  786. if (stk->override != DO_NEUTRAL)
  787. ctx->types[i] = (stk->override == DO_LTR ? L : R);
  788. break;
  789. }
  790. case PDF: {
  791. /* Rule X7 */
  792. if (ctx->overflowIsolateCount > 0) {
  793. /* Do nothing if we've overflowed on isolates */
  794. } else if (ctx->overflowEmbeddingCount > 0) {
  795. ctx->overflowEmbeddingCount--;
  796. } else if (ctx->ds_sp > 0 && !stk->isolate) {
  797. pop(ctx);
  798. } else {
  799. /* Do nothing: spurious embedding-pop */
  800. }
  801. #ifndef REMOVE_FORMATTING_CHARS
  802. ctx->levels[i] = stk->level;
  803. #endif
  804. break;
  805. }
  806. case B: {
  807. /* Rule X8: if an explicit paragraph separator appears in
  808. * this text at all then it does not participate in any of
  809. * the above, and just gets assigned the paragraph level.
  810. *
  811. * PS, it had better be right at the end of the text,
  812. * because we have not implemented rule P1 in this code. */
  813. assert(i == ctx->textlen - 1);
  814. ctx->levels[i] = ctx->paragraphLevel;
  815. break;
  816. }
  817. case BN: {
  818. /*
  819. * The section 5.2 adjustment to rule X6 says that we
  820. * apply it to BN just like any other class. But I think
  821. * this can't possibly give the same results as the
  822. * unmodified algorithm.
  823. *
  824. * Proof: adding RLO BN or LRO BN at the end of a
  825. * paragraph should not change the output of the standard
  826. * algorithm, because the override doesn't affect the BN
  827. * in rule X6, and then rule X9 removes both. But with the
  828. * modified rule X6, the BN is changed into R or L, and
  829. * then rule X9 doesn't remove it, and then you've added a
  830. * strong type that will set eos for the level run just
  831. * before the override. And whatever the standard
  832. * algorithm set eos to, _one_ of these override sequences
  833. * will disagree with it.
  834. *
  835. * So I think we just set the BN's level, and don't change
  836. * its type.
  837. */
  838. ctx->levels[i] = stk->level;
  839. break;
  840. }
  841. default: {
  842. /* Rule X6. */
  843. ctx->levels[i] = stk->level;
  844. if (stk->override != DO_NEUTRAL)
  845. ctx->types[i] = (stk->override == DO_LTR ? L : R);
  846. break;
  847. }
  848. }
  849. }
  850. #undef stk
  851. }
  852. static void remove_embedding_characters(BidiContext *ctx)
  853. {
  854. #ifndef REMOVE_FORMATTING_CHARS
  855. /*
  856. * Rule X9, as modified by section 5.2: turn embedding (but not
  857. * isolate) characters into BN.
  858. */
  859. for (size_t i = 0; i < ctx->textlen; i++) {
  860. BidiType t = ctx->types[i];
  861. if (typeIsRemovedDuringProcessing(t)) {
  862. ctx->types[i] = BN;
  863. /*
  864. * My own adjustment to the section 5.2 mods: a sequence
  865. * of contiguous BN generated by this setup should never
  866. * be at different levels from each other.
  867. *
  868. * An example where this goes wrong is if you open two
  869. * LREs in sequence, then close them again:
  870. *
  871. * ... LRE LRE PDF PDF ...
  872. *
  873. * The initial level assignment gives level 0 to the outer
  874. * LRE/PDF pair, and level 2 to the inner one. The
  875. * standard algorithm would remove all four, so this
  876. * doesn't matter, and you end up with no break in the
  877. * surrounding level run. But if you just rewrite the
  878. * types of all those characters to BN and leave the
  879. * levels in that state, then the modified algorithm will
  880. * leave the middle two BN at level 2, dividing what
  881. * should have been a long level run at level 0 into two
  882. * separate ones.
  883. */
  884. if (i > 0 && ctx->types[i-1] == BN)
  885. ctx->levels[i] = ctx->levels[i-1];
  886. }
  887. }
  888. #else
  889. /*
  890. * Rule X9, original version: completely remove embedding
  891. * start/end characters and also boundary neutrals.
  892. */
  893. size_t outpos = 0;
  894. for (size_t i = 0; i < ctx->textlen; i++) {
  895. BidiType t = ctx->types[i];
  896. if (!typeIsRemovedDuringProcessing(t)) {
  897. ctx->text[outpos] = ctx->text[i];
  898. ctx->levels[outpos] = ctx->levels[i];
  899. ctx->types[outpos] = ctx->types[i];
  900. ctx->origTypes[outpos] = ctx->origTypes[i];
  901. outpos++;
  902. }
  903. }
  904. ctx->textlen = outpos;
  905. #endif
  906. }
  907. typedef void (*irs_fn_t)(BidiContext *ctx);
  908. static void find_isolating_run_sequences(BidiContext *ctx, irs_fn_t process)
  909. {
  910. /*
  911. * Rule X10 / BD13. Now that we've assigned an embedding level to
  912. * each character in the text, we have to divide the text into
  913. * subsequences on which to do the next stage of processing.
  914. *
  915. * In earlier issues of the bidi algorithm, these subsequences
  916. * were contiguous in the original text, and each one was a 'level
  917. * run': a maximal contiguous subsequence of characters all at the
  918. * same embedding level.
  919. *
  920. * But now we have isolates, and the point of an (isolate
  921. * initiator ... PDI) sequence is that the whole sequence should
  922. * be treated like a single BN for the purposes of formatting
  923. * everything outside it. As a result, we now have to recombine
  924. * our level runs into longer sequences, on the principle that if
  925. * a level run ends with an isolate initiator, then we bring it
  926. * together with whatever later level run starts with the matching
  927. * PDI.
  928. *
  929. * These subsequences are no longer contiguous (the whole point is
  930. * that between the isolate initiator and the PDI is some other
  931. * text that we've skipped over). They're called 'isolating run
  932. * sequences'.
  933. */
  934. memset(ctx->irsdone, 0, ctx->textlen);
  935. size_t i = 0;
  936. size_t n_irs = 0;
  937. size_t indexpos = 0;
  938. while (i < ctx->textlen) {
  939. if (ctx->irsdone[i]) {
  940. i++;
  941. continue;
  942. }
  943. /*
  944. * Found a character not already processed. Start a new
  945. * sequence here.
  946. */
  947. sgrowarray(ctx->irslist, ctx->irslistsize, n_irs);
  948. IsolatingRunSequence *irs = &ctx->irslist[n_irs++];
  949. irs->start = indexpos;
  950. size_t j = i;
  951. size_t irslevel = ctx->levels[i];
  952. while (j < ctx->textlen) {
  953. /*
  954. * We expect that all level runs in this sequence will be
  955. * at the same level as each other, by construction of how
  956. * we set up the levels from the isolates in the first
  957. * place.
  958. */
  959. assert(ctx->levels[j] == irslevel);
  960. do {
  961. ctx->irsdone[j] = true;
  962. ctx->irsindices[indexpos++] = j++;
  963. } while (j < ctx->textlen && ctx->levels[j] == irslevel);
  964. if (!typeIsIsolateInitiator(ctx->types[j-1]))
  965. break; /* this IRS is ended */
  966. j = find_matching_pdi(ctx->types, j-1, ctx->textlen);
  967. }
  968. irs->end = indexpos;
  969. /*
  970. * Determine the start-of-sequence and end-of-sequence types
  971. * for this sequence.
  972. *
  973. * These depend on the embedding levels of surrounding text.
  974. * But processing each run can change those levels. That's why
  975. * we have to use a two-pass strategy here, first identifying
  976. * all the isolating run sequences using the input level data,
  977. * and not processing any of them until we know where they all
  978. * are.
  979. */
  980. size_t p;
  981. unsigned char level_inside, level_outside, level_max;
  982. p = i;
  983. level_inside = ctx->levels[p];
  984. level_outside = ctx->paragraphLevel;
  985. while (p > 0) {
  986. p--;
  987. if (ctx->types[p] != BN) {
  988. level_outside = ctx->levels[p];
  989. break;
  990. }
  991. }
  992. level_max = max(level_inside, level_outside);
  993. irs->sos = (level_max % 2 ? R : L);
  994. p = ctx->irsindices[irs->end - 1];
  995. level_inside = ctx->levels[p];
  996. level_outside = ctx->paragraphLevel;
  997. if (typeIsIsolateInitiator(ctx->types[p])) {
  998. /* Special case: if an isolating run sequence ends in an
  999. * unmatched isolate initiator, then level_outside is
  1000. * taken to be the paragraph embedding level and the
  1001. * loop below is skipped. */
  1002. } else {
  1003. while (p+1 < ctx->textlen) {
  1004. p++;
  1005. if (ctx->types[p] != BN) {
  1006. level_outside = ctx->levels[p];
  1007. break;
  1008. }
  1009. }
  1010. }
  1011. level_max = max(level_inside, level_outside);
  1012. irs->eos = (level_max % 2 ? R : L);
  1013. irs->embeddingDirection = (irslevel % 2 ? R : L);
  1014. /*
  1015. * Now we've listed in ctx->irsindices[] the index of every
  1016. * character that's part of this isolating run sequence, and
  1017. * recorded an entry in irslist containing the interval of
  1018. * indices relevant to this IRS, plus its assorted metadata.
  1019. * We've also marked those locations in the input text as done
  1020. * in ctx->irsdone, so that we'll skip over them when the
  1021. * outer iteration reaches them later.
  1022. */
  1023. }
  1024. for (size_t k = 0; k < n_irs; k++) {
  1025. IsolatingRunSequence *irs = &ctx->irslist[k];
  1026. ctx->irs = ctx->irsindices + irs->start;
  1027. ctx->irslen = irs->end - irs->start;
  1028. ctx->sos = irs->sos;
  1029. ctx->eos = irs->eos;
  1030. ctx->embeddingDirection = irs->embeddingDirection;
  1031. process(ctx);
  1032. }
  1033. /* Reset irslen to 0 when we've finished. This means any other
  1034. * functions that absentmindedly try to use irslen at all will end
  1035. * up doing nothing at all, which should be easier to detect and
  1036. * debug than if they run on subtly the wrong subset of the
  1037. * text. */
  1038. ctx->irslen = 0;
  1039. }
  1040. static void remove_nsm(BidiContext *ctx)
  1041. {
  1042. /* Rule W1: NSM gains the type of the previous character, or sos
  1043. * at the start of the run, with the exception that isolation
  1044. * boundaries turn into ON. */
  1045. BidiType prevType = ctx->sos;
  1046. for (size_t c = 0; c < ctx->irslen; c++) {
  1047. size_t i = ctx->irs[c];
  1048. BidiType t = ctx->types[i];
  1049. if (t == NSM) {
  1050. ctx->types[i] = prevType;
  1051. } else if (typeIsIsolateInitiatorOrPDI(t)) {
  1052. prevType = ON;
  1053. #ifndef REMOVE_FORMATTING_CHARS
  1054. } else if (t == BN) {
  1055. /* section 5.2 adjustment: these don't affect prevType */
  1056. #endif
  1057. } else {
  1058. prevType = t;
  1059. }
  1060. }
  1061. }
  1062. static void change_en_to_an(BidiContext *ctx)
  1063. {
  1064. /* Rule W2: EN becomes AN if the previous strong type is AL. (The
  1065. * spec says that the 'previous strong type' is counted as sos at
  1066. * the start of the run, although it hardly matters, since sos
  1067. * can't be AL.) */
  1068. BidiType prevStrongType = ctx->sos;
  1069. for (size_t c = 0; c < ctx->irslen; c++) {
  1070. size_t i = ctx->irs[c];
  1071. BidiType t = ctx->types[i];
  1072. if (t == EN && prevStrongType == AL) {
  1073. ctx->types[i] = AN;
  1074. } else if (typeIsStrong(t)) {
  1075. prevStrongType = t;
  1076. }
  1077. }
  1078. }
  1079. static void change_al_to_r(BidiContext *ctx)
  1080. {
  1081. /* Rule W3: AL becomes R unconditionally. (The only difference
  1082. * between the two types was their effect on nearby numbers, which
  1083. * was dealt with in rule W2, so now we're done with the
  1084. * distinction.) */
  1085. for (size_t c = 0; c < ctx->irslen; c++) {
  1086. size_t i = ctx->irs[c];
  1087. if (ctx->types[i] == AL)
  1088. ctx->types[i] = R;
  1089. }
  1090. }
  1091. static void eliminate_separators_between_numbers(BidiContext *ctx)
  1092. {
  1093. /* Rule W4: a single numeric separator between two numbers of the
  1094. * same type compatible with that separator takes the type of the
  1095. * number. ES is a separator type compatible only with EN; CS is a
  1096. * separator type compatible with either EN or AN.
  1097. *
  1098. * Section 5.2 adjustment: intervening BNs do not break this, so
  1099. * instead of simply looking at types[irs[c-1]] and types[irs[c+1]],
  1100. * we must track the last three indices we saw that were not BN. */
  1101. size_t i1 = 0, i2 = 0;
  1102. BidiType t0 = ON, t1 = ON, t2 = ON;
  1103. for (size_t c = 0; c < ctx->irslen; c++) {
  1104. size_t i = ctx->irs[c];
  1105. BidiType t = ctx->types[i];
  1106. #ifndef REMOVE_FORMATTING_CHARS
  1107. if (t == BN)
  1108. continue;
  1109. #endif
  1110. i1 = i2; i2 = i;
  1111. t0 = t1; t1 = t2; t2 = t;
  1112. if (t0 == t2 && ((t1 == ES && t0 == EN) ||
  1113. (t1 == CS && (t0 == EN || t0 == AN)))) {
  1114. ctx->types[i1] = t0;
  1115. }
  1116. }
  1117. }
  1118. static void eliminate_et_next_to_en(BidiContext *ctx)
  1119. {
  1120. /* Rule W5: a sequence of ET adjacent to an EN take the type EN.
  1121. * This is easiest to implement with one loop in each direction.
  1122. *
  1123. * Section 5.2 adjustment: include BN with ET. (We don't need to
  1124. * #ifdef that out, because in the standard algorithm, we won't
  1125. * have any BN left in any case.) */
  1126. bool modifying = false;
  1127. for (size_t c = 0; c < ctx->irslen; c++) {
  1128. size_t i = ctx->irs[c];
  1129. BidiType t = ctx->types[i];
  1130. if (t == EN) {
  1131. modifying = true;
  1132. } else if (modifying && typeIsETOrBN(t)) {
  1133. ctx->types[i] = EN;
  1134. } else {
  1135. modifying = false;
  1136. }
  1137. }
  1138. for (size_t c = ctx->irslen; c-- > 0 ;) {
  1139. size_t i = ctx->irs[c];
  1140. BidiType t = ctx->types[i];
  1141. if (t == EN) {
  1142. modifying = true;
  1143. } else if (modifying && typeIsETOrBN(t)) {
  1144. ctx->types[i] = EN;
  1145. } else {
  1146. modifying = false;
  1147. }
  1148. }
  1149. }
  1150. static void eliminate_separators_and_terminators(BidiContext *ctx)
  1151. {
  1152. /* Rule W6: all separators and terminators change to ON.
  1153. *
  1154. * (The spec is not quite clear on which bidi types are included
  1155. * in this; one assumes ES, ET and CS, but what about S? I _think_
  1156. * the answer is that this is a rule in the W section, so it's
  1157. * implicitly supposed to only apply to types designated as weakly
  1158. * directional, so not S.) */
  1159. #ifndef REMOVE_FORMATTING_CHARS
  1160. /*
  1161. * Section 5.2 adjustment: this also applies to any BN adjacent on
  1162. * either side to one of these types, which is easiest to
  1163. * implement with a separate double-loop converting those to an
  1164. * arbitrary one of the affected types, say CS.
  1165. *
  1166. * This double loop can be completely skipped in the standard
  1167. * algorithm.
  1168. */
  1169. bool modifying = false;
  1170. for (size_t c = 0; c < ctx->irslen; c++) {
  1171. size_t i = ctx->irs[c];
  1172. BidiType t = ctx->types[i];
  1173. if (typeIsWeakSeparatorOrTerminator(t)) {
  1174. modifying = true;
  1175. } else if (modifying && t == BN) {
  1176. ctx->types[i] = CS;
  1177. } else {
  1178. modifying = false;
  1179. }
  1180. }
  1181. for (size_t c = ctx->irslen; c-- > 0 ;) {
  1182. size_t i = ctx->irs[c];
  1183. BidiType t = ctx->types[i];
  1184. if (typeIsWeakSeparatorOrTerminator(t)) {
  1185. modifying = true;
  1186. } else if (modifying && t == BN) {
  1187. ctx->types[i] = CS;
  1188. } else {
  1189. modifying = false;
  1190. }
  1191. }
  1192. #endif
  1193. /* Now the main part of rule W6 */
  1194. for (size_t c = 0; c < ctx->irslen; c++) {
  1195. size_t i = ctx->irs[c];
  1196. BidiType t = ctx->types[i];
  1197. if (typeIsWeakSeparatorOrTerminator(t))
  1198. ctx->types[i] = ON;
  1199. }
  1200. }
  1201. static void change_en_to_l(BidiContext *ctx)
  1202. {
  1203. /* Rule W7: EN becomes L if the previous strong type (or sos) is L. */
  1204. BidiType prevStrongType = ctx->sos;
  1205. for (size_t c = 0; c < ctx->irslen; c++) {
  1206. size_t i = ctx->irs[c];
  1207. BidiType t = ctx->types[i];
  1208. if (t == EN && prevStrongType == L) {
  1209. ctx->types[i] = L;
  1210. } else if (typeIsStrong(t)) {
  1211. prevStrongType = t;
  1212. }
  1213. }
  1214. }
  1215. typedef void (*bracket_pair_fn)(BidiContext *ctx, size_t copen, size_t cclose);
  1216. static void find_bracket_pairs(BidiContext *ctx, bracket_pair_fn process)
  1217. {
  1218. const size_t NO_BRACKET = ~(size_t)0;
  1219. /*
  1220. * Rule BD16.
  1221. */
  1222. size_t sp = 0;
  1223. for (size_t c = 0; c < ctx->irslen; c++)
  1224. ctx->bracketpos[c] = NO_BRACKET;
  1225. for (size_t c = 0; c < ctx->irslen; c++) {
  1226. size_t i = ctx->irs[c];
  1227. unsigned wc = ctx->text[i].wc;
  1228. BracketTypeData bt = bracket_type(wc);
  1229. if (bt.type == BT_OPEN) {
  1230. if (sp >= lenof(ctx->bstack)) {
  1231. /*
  1232. * Stack overflow. The spec says we simply give up at
  1233. * this point.
  1234. */
  1235. goto found_all_pairs;
  1236. }
  1237. ctx->bstack[sp].ch = wc;
  1238. ctx->bstack[sp].c = c;
  1239. sp++;
  1240. } else if (bt.type == BT_CLOSE) {
  1241. size_t new_sp = sp;
  1242. /*
  1243. * Search up the stack for an entry containing a matching
  1244. * open bracket. If we find it, pop that entry and
  1245. * everything deeper, and record a matching pair. If we
  1246. * reach the bottom of the stack without finding anything,
  1247. * leave sp where it started.
  1248. */
  1249. while (new_sp-- > 0) {
  1250. if (ctx->bstack[new_sp].ch == bt.partner ||
  1251. ctx->bstack[new_sp].ch == bt.equiv_partner) {
  1252. /* Found a stack element matching this one */
  1253. size_t cstart = ctx->bstack[new_sp].c;
  1254. ctx->bracketpos[cstart] = c;
  1255. sp = new_sp;
  1256. break;
  1257. }
  1258. }
  1259. }
  1260. }
  1261. found_all_pairs:
  1262. for (size_t c = 0; c < ctx->irslen; c++) {
  1263. if (ctx->bracketpos[c] != NO_BRACKET) {
  1264. process(ctx, c, ctx->bracketpos[c]);
  1265. }
  1266. }
  1267. }
  1268. static BidiType get_bracket_type(BidiContext *ctx, size_t copen, size_t cclose)
  1269. {
  1270. /*
  1271. * Rule N0: a pair of matched brackets containing at least one
  1272. * strong type takes on the current embedding direction, unless
  1273. * all of these are true at once:
  1274. *
  1275. * (a) there are no strong types inside the brackets matching the
  1276. * current embedding direction
  1277. * (b) there _is_ at least one strong type inside the brackets
  1278. * that is _opposite_ to the current embedding direction
  1279. * (c) the strong type preceding the open bracket is also
  1280. * opposite to the current embedding direction
  1281. *
  1282. * in which case they take on the opposite direction.
  1283. *
  1284. * For these purposes, number types (EN and AN) count as R.
  1285. */
  1286. bool foundOppositeTypeInside = false;
  1287. for (size_t c = copen + 1; c < cclose; c++) {
  1288. size_t i = ctx->irs[c];
  1289. BidiType t = ctx->types[i];
  1290. if (typeIsStrongOrNumber(t)) {
  1291. t = t == L ? L : R; /* numbers count as R */
  1292. if (t == ctx->embeddingDirection) {
  1293. /* Found something inside the brackets matching the
  1294. * current level, so (a) is violated. */
  1295. return ctx->embeddingDirection;
  1296. } else {
  1297. foundOppositeTypeInside = true;
  1298. }
  1299. }
  1300. }
  1301. if (!foundOppositeTypeInside) {
  1302. /* No strong types at all inside the brackets, so return ON to
  1303. * indicate that we're not messing with their type at all. */
  1304. return ON;
  1305. }
  1306. /* There was an opposite strong type in the brackets. Look
  1307. * backwards to the preceding strong type, and go with that,
  1308. * whichever it is. */
  1309. for (size_t c = copen; c-- > 0 ;) {
  1310. size_t i = ctx->irs[c];
  1311. BidiType t = ctx->types[i];
  1312. if (typeIsStrongOrNumber(t)) {
  1313. t = t == L ? L : R; /* numbers count as R */
  1314. return t;
  1315. }
  1316. }
  1317. /* Fallback: if the preceding strong type was not found, go with
  1318. * sos. */
  1319. return ctx->sos;
  1320. }
  1321. static void reset_bracket_type(BidiContext *ctx, size_t c, BidiType t)
  1322. {
  1323. /* Final bullet point of rule N0: when we change the type of a
  1324. * bracket, the same change applies to any contiguous sequence of
  1325. * characters after it whose _original_ bidi type was NSM. */
  1326. do {
  1327. ctx->types[ctx->irs[c++]] = t;
  1328. #ifndef REMOVE_FORMATTING_CHARS
  1329. while (c < ctx->irslen && ctx->origTypes[ctx->irs[c]] == BN) {
  1330. /* Section 5.2 adjustment: skip past BN in the process. */
  1331. c++;
  1332. }
  1333. #endif
  1334. } while (c < ctx->irslen && ctx->origTypes[ctx->irs[c]] == NSM);
  1335. }
  1336. static void resolve_brackets(BidiContext *ctx, size_t copen, size_t cclose)
  1337. {
  1338. if (typeIsNeutral(ctx->types[ctx->irs[copen]]) &&
  1339. typeIsNeutral(ctx->types[ctx->irs[cclose]])) {
  1340. BidiType t = get_bracket_type(ctx, copen, cclose);
  1341. if (t != ON) {
  1342. reset_bracket_type(ctx, copen, t);
  1343. reset_bracket_type(ctx, cclose, t);
  1344. }
  1345. }
  1346. }
  1347. static void remove_ni(BidiContext *ctx)
  1348. {
  1349. /*
  1350. * Rules N1 and N2 together: neutral or isolate characters take
  1351. * the direction of the surrounding strong text if the nearest
  1352. * strong characters on each side match, and otherwise, they take
  1353. * the embedding direction.
  1354. */
  1355. const size_t NO_INDEX = ~(size_t)0;
  1356. BidiType prevStrongType = ctx->sos;
  1357. size_t c_ni_start = NO_INDEX;
  1358. for (size_t c = 0; c <= ctx->irslen; c++) {
  1359. BidiType t;
  1360. if (c < ctx->irslen) {
  1361. size_t i = ctx->irs[c];
  1362. t = ctx->types[i];
  1363. } else {
  1364. /* One extra loop iteration, using eos to resolve the
  1365. * final sequence of NI if any */
  1366. t = ctx->eos;
  1367. }
  1368. if (typeIsStrongOrNumber(t)) {
  1369. t = t == L ? L : R; /* numbers count as R */
  1370. if (c_ni_start != NO_INDEX) {
  1371. /* There are some NI we have to fix up */
  1372. BidiType ni_type = (t == prevStrongType ? t :
  1373. ctx->embeddingDirection);
  1374. for (size_t c2 = c_ni_start; c2 < c; c2++) {
  1375. size_t i2 = ctx->irs[c2];
  1376. BidiType t2 = ctx->types[i2];
  1377. if (typeIsNeutralOrIsolate(t2))
  1378. ctx->types[i2] = ni_type;
  1379. }
  1380. }
  1381. prevStrongType = t;
  1382. c_ni_start = NO_INDEX;
  1383. } else if (typeIsNeutralOrIsolate(t) && c_ni_start == NO_INDEX) {
  1384. c_ni_start = c;
  1385. }
  1386. }
  1387. }
  1388. static void resolve_implicit_levels(BidiContext *ctx)
  1389. {
  1390. /* Rules I1 and I2 */
  1391. for (size_t c = 0; c < ctx->irslen; c++) {
  1392. size_t i = ctx->irs[c];
  1393. unsigned char level = ctx->levels[i];
  1394. BidiType t = ctx->types[i];
  1395. if (level % 2 == 0) {
  1396. /* Rule I1 */
  1397. if (t == R)
  1398. ctx->levels[i] += 1;
  1399. else if (t == AN || t == EN)
  1400. ctx->levels[i] += 2;
  1401. } else {
  1402. /* Rule I2 */
  1403. if (t == L || t == AN || t == EN)
  1404. ctx->levels[i] += 1;
  1405. }
  1406. }
  1407. }
  1408. static void process_isolating_run_sequence(BidiContext *ctx)
  1409. {
  1410. /* Section W: resolve weak types */
  1411. remove_nsm(ctx);
  1412. change_en_to_an(ctx);
  1413. change_al_to_r(ctx);
  1414. eliminate_separators_between_numbers(ctx);
  1415. eliminate_et_next_to_en(ctx);
  1416. eliminate_separators_and_terminators(ctx);
  1417. change_en_to_l(ctx);
  1418. /* Section N: resolve neutral types (and isolates) */
  1419. find_bracket_pairs(ctx, resolve_brackets);
  1420. remove_ni(ctx);
  1421. /* Section I: resolve implicit levels */
  1422. resolve_implicit_levels(ctx);
  1423. }
  1424. static void reset_whitespace_and_separators(BidiContext *ctx)
  1425. {
  1426. /*
  1427. * Rule L1: segment and paragraph separators, plus whitespace
  1428. * preceding them, all reset to the paragraph embedding level.
  1429. * This also applies to whitespace at the very end.
  1430. *
  1431. * This is done using the original types, not the versions that
  1432. * the rest of this algorithm has been merrily mutating.
  1433. */
  1434. bool modifying = true;
  1435. for (size_t i = ctx->textlen; i-- > 0 ;) {
  1436. BidiType t = ctx->origTypes[i];
  1437. if (typeIsSegmentOrParaSeparator(t)) {
  1438. ctx->levels[i] = ctx->paragraphLevel;
  1439. modifying = true;
  1440. } else if (modifying) {
  1441. if (typeIsWhitespaceOrIsolate(t)) {
  1442. ctx->levels[i] = ctx->paragraphLevel;
  1443. } else if (!typeIsRemovedDuringProcessing(t)) {
  1444. modifying = false;
  1445. }
  1446. }
  1447. }
  1448. #ifndef REMOVE_FORMATTING_CHARS
  1449. /*
  1450. * Section 5.2 adjustment: types removed by rule X9 take the level
  1451. * of the character to their left.
  1452. */
  1453. for (size_t i = 0; i < ctx->textlen; i++) {
  1454. BidiType t = ctx->origTypes[i];
  1455. if (typeIsRemovedDuringProcessing(t)) {
  1456. /* Section 5.2 adjustment */
  1457. ctx->levels[i] = (i > 0 ? ctx->levels[i-1] : ctx->paragraphLevel);
  1458. }
  1459. }
  1460. #endif /* ! REMOVE_FORMATTING_CHARS */
  1461. }
  1462. static void reverse(BidiContext *ctx, size_t start, size_t end)
  1463. {
  1464. for (size_t i = start, j = end; i < j; i++, j--) {
  1465. bidi_char tmp = ctx->text[i];
  1466. ctx->text[i] = ctx->text[j];
  1467. ctx->text[j] = tmp;
  1468. }
  1469. }
  1470. static void mirror_glyphs(BidiContext *ctx)
  1471. {
  1472. /*
  1473. * Rule L3: any character with a mirror-image pair at an odd
  1474. * embedding level is replaced by its mirror image.
  1475. *
  1476. * This is specified in the standard as happening _after_ rule L2
  1477. * (the actual reordering of the text). But it's much easier to
  1478. * implement it before, while our levels[] array still matches up
  1479. * to the text order.
  1480. */
  1481. for (size_t i = 0; i < ctx->textlen; i++) {
  1482. if (ctx->levels[i] % 2)
  1483. ctx->text[i].wc = mirror_glyph(ctx->text[i].wc);
  1484. }
  1485. }
  1486. static void reverse_sequences(BidiContext *ctx)
  1487. {
  1488. /*
  1489. * Rule L2: every maximal contiguous sequence of characters at a
  1490. * given level or higher is reversed.
  1491. */
  1492. unsigned level = 0;
  1493. for (size_t i = 0; i < ctx->textlen; i++)
  1494. level = max(level, ctx->levels[i]);
  1495. for (; level >= 1; level--) {
  1496. for (size_t i = 0; i < ctx->textlen; i++) {
  1497. if (ctx->levels[i] >= level) {
  1498. size_t start = i;
  1499. while (i+1 < ctx->textlen && ctx->levels[i+1] >= level)
  1500. i++;
  1501. reverse(ctx, start, i);
  1502. }
  1503. }
  1504. }
  1505. }
  1506. /*
  1507. * The Main Bidi Function. The two wrappers below it present different
  1508. * external APIs for different purposes, but everything comes through
  1509. * here.
  1510. *
  1511. * text: a buffer of size textlen containing text to apply the
  1512. * Bidirectional algorithm to.
  1513. */
  1514. static void do_bidi_new(BidiContext *ctx, bidi_char *text, size_t textlen)
  1515. {
  1516. ensure_arrays(ctx, textlen);
  1517. ctx->text = text;
  1518. ctx->textlen = textlen;
  1519. setup_types(ctx);
  1520. /* Quick initial test: see if we need to bother with any work at all */
  1521. if (!text_needs_bidi(ctx))
  1522. return;
  1523. set_paragraph_level(ctx);
  1524. process_explicit_embeddings(ctx);
  1525. remove_embedding_characters(ctx);
  1526. find_isolating_run_sequences(ctx, process_isolating_run_sequence);
  1527. /* If this implementation distinguished paragraphs from lines,
  1528. * then this would be the point where we repeat the remainder of
  1529. * the algorithm once for each line in the paragraph. */
  1530. reset_whitespace_and_separators(ctx);
  1531. mirror_glyphs(ctx);
  1532. reverse_sequences(ctx);
  1533. }
  1534. size_t do_bidi_test(BidiContext *ctx, bidi_char *text, size_t textlen,
  1535. int override)
  1536. {
  1537. ctx->paragraphOverride = (override > 0 ? L : override < 0 ? R : ON);
  1538. do_bidi_new(ctx, text, textlen);
  1539. return ctx->textlen;
  1540. }
  1541. void do_bidi(BidiContext *ctx, bidi_char *text, size_t textlen)
  1542. {
  1543. #ifdef REMOVE_FORMATTING_CHARACTERS
  1544. abort(); /* can't use the standard algorithm in a live terminal */
  1545. #else
  1546. ctx->paragraphOverride = ON;
  1547. do_bidi_new(ctx, text, textlen);
  1548. #endif
  1549. }