123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686 |
- /*
- * Implementation of the Unicode bidirectional and Arabic shaping
- * algorithms for PuTTY.
- *
- * Original version written and kindly contributed to this code base
- * by Ahmad Khalifa of Arabeyes. The bidi part was almost completely
- * rewritten in 2021 by Simon Tatham to bring it up to date, but the
- * shaping part is still the one by the original authors.
- *
- * Implementation notes:
- *
- * Algorithm version
- * -----------------
- *
- * This algorithm is up to date with Unicode Standard Annex #9
- * revision 46:
- *
- * https://www.unicode.org/reports/tr9/tr9-46.html
- *
- * and passes the full conformance test suite in Unicode 15.0.0.
- *
- * Paragraph and line handling
- * ---------------------------
- *
- * The full Unicode bidi algorithm expects to receive text containing
- * multiple paragraphs, together with a decision about how those
- * paragraphs are broken up into lines. It calculates embedding levels
- * a whole paragraph at a time without considering the line breaks,
- * but then the final reordering of the text for display is done to
- * each _line_ independently based on the levels computed for the text
- * in that line.
- *
- * This algorithm omits all of that, because it's intended for use as
- * a display-time transformation of a text terminal, which doesn't
- * preserve enough semantic information to decide what's a paragraph
- * break and what is not. So a piece of input text provided to this
- * algorithm is always expected to consist of exactly one paragraph
- * *and* exactly one line.
- *
- * Embeddings, overrides and isolates
- * ----------------------------------
- *
- * This implementation has full support for all the Unicode special
- * control characters that modify bidi behaviour, such as
- *
- * U+202A LEFT-TO-RIGHT EMBEDDING
- * U+202B RIGHT-TO-LEFT EMBEDDING
- * U+202D LEFT-TO-RIGHT OVERRIDE
- * U+202E RIGHT-TO-LEFT OVERRIDE
- * U+202C POP DIRECTIONAL FORMATTING
- * U+2068 FIRST STRONG ISOLATE
- * U+2066 LEFT-TO-RIGHT ISOLATE
- * U+2067 RIGHT-TO-LEFT ISOLATE
- * U+2069 POP DIRECTIONAL ISOLATE
- *
- * However, at present, the terminal emulator that is a client of this
- * code has no way to pass those in (because they're dropped during
- * escape sequence processing and don't get stored in the terminal
- * state). Nonetheless, the code is all here, so if the terminal
- * emulator becomes able to record those characters at some later
- * point, we'll be all set to take account of them during bidi.
- *
- * But the _main_ purpose of supporting the full bidi algorithm is
- * simply that that's the easiest way to be sure it's correct, because
- * if you support the whole thing, you can run the full conformance
- * test suite. (And I don't 100% believe that restricting to the
- * subset of _tests_ valid with a reduced character set will test the
- * full set of _functionality_ relevant to the reduced set.)
- *
- * Retained formatting characters
- * ------------------------------
- *
- * The standard bidi algorithm, in step X9, deletes assorted
- * formatting characters from the text: all the embedding and override
- * section initiator characters, the Pop Directional Formatting
- * character that closes one of those sections again, and any
- * character labelled as Boundary Neutral. So the characters it
- * returns are not a _full_ reordering of the input; some input
- * characters vanish completely.
- *
- * This would be fine, if it were not for the fact that - as far as I
- * can see - _exactly one_ Unicode code point in the discarded
- * category has a wcwidth() of more than 0, namely U+00AD SOFT HYPHEN
- * which is a printing character for terminal purposes but has a bidi
- * class of BN.
- *
- * Therefore, we must implement a modified version of the algorithm,
- * as described in section 5.2 of TR9, which retains those formatting
- * characters so that a client can find out where they ended up in the
- * reordering.
- *
- * Section 5.2 describes a set of modifications to the algorithm that
- * are _intended_ to achieve this without changing the rest of the
- * behaviour: that is, if you take the output of the modified
- * algorithm and delete all the characters that the standard algorithm
- * would have removed, you should end up with the remaining characters
- * in the same order that the standard algorithm would have delivered.
- * However, section 5.2 admits the possibility of error, and says "in
- * case of any deviation the explicit algorithm is the normative
- * statement for conformance". And indeed, in one or two places I
- * found I had to make my own tweaks to the section 5.2 description in
- * order to get the whole test suite to pass, because I think the 5.2
- * modifications if taken literally don't quite achieve that. My
- * justification is that sentence of 5.2: in case of doubt, the right
- * thing is to make the code behave the same as the official
- * algorithm.
- *
- * It's possible that there might still be some undiscovered
- * discrepancies between the behaviour of the standard and modified
- * algorithms. So, just in case, I've kept in this code the ability to
- * implement the _standard_ algorithm too! If you compile with
- * -DREMOVE_FORMATTING_CHARS, this code should go back to implementing
- * the literal UAX#9 bidi algorithm - so you can run your suspect
- * input through both versions, making it much easier to figure out
- * why they differ, and in which of the many stages of the algorithm
- * the difference was introduced.
- *
- * However, beware that when compiling in this mode, the do_bidi
- * interface to the terminal will stop working, and just abort() when
- * called! The only useful thing you can do with this mode is to run
- * the companion program bidi_test.c.
- */
- #include <stdlib.h> /* definition of wchar_t */
- #include "putty.h"
- #include "misc.h"
- #include "bidi.h"
- typedef struct {
- char type;
- wchar_t form_b;
- } shape_node;
- /* Kept near the actual table, for verification. */
- #define SHAPE_FIRST 0x621
- #define SHAPE_LAST (SHAPE_FIRST + lenof(shapetypes) - 1)
- static const shape_node shapetypes[] = {
- /* index, Typ, Iso, Ligature Index*/
- /* 621 */ {SU, 0xFE80},
- /* 622 */ {SR, 0xFE81},
- /* 623 */ {SR, 0xFE83},
- /* 624 */ {SR, 0xFE85},
- /* 625 */ {SR, 0xFE87},
- /* 626 */ {SD, 0xFE89},
- /* 627 */ {SR, 0xFE8D},
- /* 628 */ {SD, 0xFE8F},
- /* 629 */ {SR, 0xFE93},
- /* 62A */ {SD, 0xFE95},
- /* 62B */ {SD, 0xFE99},
- /* 62C */ {SD, 0xFE9D},
- /* 62D */ {SD, 0xFEA1},
- /* 62E */ {SD, 0xFEA5},
- /* 62F */ {SR, 0xFEA9},
- /* 630 */ {SR, 0xFEAB},
- /* 631 */ {SR, 0xFEAD},
- /* 632 */ {SR, 0xFEAF},
- /* 633 */ {SD, 0xFEB1},
- /* 634 */ {SD, 0xFEB5},
- /* 635 */ {SD, 0xFEB9},
- /* 636 */ {SD, 0xFEBD},
- /* 637 */ {SD, 0xFEC1},
- /* 638 */ {SD, 0xFEC5},
- /* 639 */ {SD, 0xFEC9},
- /* 63A */ {SD, 0xFECD},
- /* 63B */ {SU, 0x0},
- /* 63C */ {SU, 0x0},
- /* 63D */ {SU, 0x0},
- /* 63E */ {SU, 0x0},
- /* 63F */ {SU, 0x0},
- /* 640 */ {SC, 0x0},
- /* 641 */ {SD, 0xFED1},
- /* 642 */ {SD, 0xFED5},
- /* 643 */ {SD, 0xFED9},
- /* 644 */ {SD, 0xFEDD},
- /* 645 */ {SD, 0xFEE1},
- /* 646 */ {SD, 0xFEE5},
- /* 647 */ {SD, 0xFEE9},
- /* 648 */ {SR, 0xFEED},
- /* 649 */ {SR, 0xFEEF}, /* SD */
- /* 64A */ {SD, 0xFEF1},
- /* 64B */ {SU, 0x0},
- /* 64C */ {SU, 0x0},
- /* 64D */ {SU, 0x0},
- /* 64E */ {SU, 0x0},
- /* 64F */ {SU, 0x0},
- /* 650 */ {SU, 0x0},
- /* 651 */ {SU, 0x0},
- /* 652 */ {SU, 0x0},
- /* 653 */ {SU, 0x0},
- /* 654 */ {SU, 0x0},
- /* 655 */ {SU, 0x0},
- /* 656 */ {SU, 0x0},
- /* 657 */ {SU, 0x0},
- /* 658 */ {SU, 0x0},
- /* 659 */ {SU, 0x0},
- /* 65A */ {SU, 0x0},
- /* 65B */ {SU, 0x0},
- /* 65C */ {SU, 0x0},
- /* 65D */ {SU, 0x0},
- /* 65E */ {SU, 0x0},
- /* 65F */ {SU, 0x0},
- /* 660 */ {SU, 0x0},
- /* 661 */ {SU, 0x0},
- /* 662 */ {SU, 0x0},
- /* 663 */ {SU, 0x0},
- /* 664 */ {SU, 0x0},
- /* 665 */ {SU, 0x0},
- /* 666 */ {SU, 0x0},
- /* 667 */ {SU, 0x0},
- /* 668 */ {SU, 0x0},
- /* 669 */ {SU, 0x0},
- /* 66A */ {SU, 0x0},
- /* 66B */ {SU, 0x0},
- /* 66C */ {SU, 0x0},
- /* 66D */ {SU, 0x0},
- /* 66E */ {SU, 0x0},
- /* 66F */ {SU, 0x0},
- /* 670 */ {SU, 0x0},
- /* 671 */ {SR, 0xFB50},
- /* 672 */ {SU, 0x0},
- /* 673 */ {SU, 0x0},
- /* 674 */ {SU, 0x0},
- /* 675 */ {SU, 0x0},
- /* 676 */ {SU, 0x0},
- /* 677 */ {SU, 0x0},
- /* 678 */ {SU, 0x0},
- /* 679 */ {SD, 0xFB66},
- /* 67A */ {SD, 0xFB5E},
- /* 67B */ {SD, 0xFB52},
- /* 67C */ {SU, 0x0},
- /* 67D */ {SU, 0x0},
- /* 67E */ {SD, 0xFB56},
- /* 67F */ {SD, 0xFB62},
- /* 680 */ {SD, 0xFB5A},
- /* 681 */ {SU, 0x0},
- /* 682 */ {SU, 0x0},
- /* 683 */ {SD, 0xFB76},
- /* 684 */ {SD, 0xFB72},
- /* 685 */ {SU, 0x0},
- /* 686 */ {SD, 0xFB7A},
- /* 687 */ {SD, 0xFB7E},
- /* 688 */ {SR, 0xFB88},
- /* 689 */ {SU, 0x0},
- /* 68A */ {SU, 0x0},
- /* 68B */ {SU, 0x0},
- /* 68C */ {SR, 0xFB84},
- /* 68D */ {SR, 0xFB82},
- /* 68E */ {SR, 0xFB86},
- /* 68F */ {SU, 0x0},
- /* 690 */ {SU, 0x0},
- /* 691 */ {SR, 0xFB8C},
- /* 692 */ {SU, 0x0},
- /* 693 */ {SU, 0x0},
- /* 694 */ {SU, 0x0},
- /* 695 */ {SU, 0x0},
- /* 696 */ {SU, 0x0},
- /* 697 */ {SU, 0x0},
- /* 698 */ {SR, 0xFB8A},
- /* 699 */ {SU, 0x0},
- /* 69A */ {SU, 0x0},
- /* 69B */ {SU, 0x0},
- /* 69C */ {SU, 0x0},
- /* 69D */ {SU, 0x0},
- /* 69E */ {SU, 0x0},
- /* 69F */ {SU, 0x0},
- /* 6A0 */ {SU, 0x0},
- /* 6A1 */ {SU, 0x0},
- /* 6A2 */ {SU, 0x0},
- /* 6A3 */ {SU, 0x0},
- /* 6A4 */ {SD, 0xFB6A},
- /* 6A5 */ {SU, 0x0},
- /* 6A6 */ {SD, 0xFB6E},
- /* 6A7 */ {SU, 0x0},
- /* 6A8 */ {SU, 0x0},
- /* 6A9 */ {SD, 0xFB8E},
- /* 6AA */ {SU, 0x0},
- /* 6AB */ {SU, 0x0},
- /* 6AC */ {SU, 0x0},
- /* 6AD */ {SD, 0xFBD3},
- /* 6AE */ {SU, 0x0},
- /* 6AF */ {SD, 0xFB92},
- /* 6B0 */ {SU, 0x0},
- /* 6B1 */ {SD, 0xFB9A},
- /* 6B2 */ {SU, 0x0},
- /* 6B3 */ {SD, 0xFB96},
- /* 6B4 */ {SU, 0x0},
- /* 6B5 */ {SU, 0x0},
- /* 6B6 */ {SU, 0x0},
- /* 6B7 */ {SU, 0x0},
- /* 6B8 */ {SU, 0x0},
- /* 6B9 */ {SU, 0x0},
- /* 6BA */ {SR, 0xFB9E},
- /* 6BB */ {SD, 0xFBA0},
- /* 6BC */ {SU, 0x0},
- /* 6BD */ {SU, 0x0},
- /* 6BE */ {SD, 0xFBAA},
- /* 6BF */ {SU, 0x0},
- /* 6C0 */ {SR, 0xFBA4},
- /* 6C1 */ {SD, 0xFBA6},
- /* 6C2 */ {SU, 0x0},
- /* 6C3 */ {SU, 0x0},
- /* 6C4 */ {SU, 0x0},
- /* 6C5 */ {SR, 0xFBE0},
- /* 6C6 */ {SR, 0xFBD9},
- /* 6C7 */ {SR, 0xFBD7},
- /* 6C8 */ {SR, 0xFBDB},
- /* 6C9 */ {SR, 0xFBE2},
- /* 6CA */ {SU, 0x0},
- /* 6CB */ {SR, 0xFBDE},
- /* 6CC */ {SD, 0xFBFC},
- /* 6CD */ {SU, 0x0},
- /* 6CE */ {SU, 0x0},
- /* 6CF */ {SU, 0x0},
- /* 6D0 */ {SU, 0x0},
- /* 6D1 */ {SU, 0x0},
- /* 6D2 */ {SR, 0xFBAE},
- };
- /*
- * Returns the bidi character type of ch.
- */
- unsigned char bidi_getType(int ch)
- {
- static const struct {
- int first, last, type;
- } lookup[] = {
- #include "unicode/bidi_type.h"
- };
- int i, j, k;
- i = -1;
- j = lenof(lookup);
- while (j - i > 1) {
- k = (i + j) / 2;
- if (ch < lookup[k].first)
- j = k;
- else if (ch > lookup[k].last)
- i = k;
- else
- return lookup[k].type;
- }
- /*
- * If we reach here, the character was not in any of the
- * intervals listed in the lookup table. This means we return
- * ON (`Other Neutrals'). This is the appropriate code for any
- * character genuinely not listed in the Unicode table, and
- * also the table above has deliberately left out any
- * characters _explicitly_ listed as ON (to save space!).
- */
- return ON;
- }
- /*
- * Return the mirrored version of a glyph.
- *
- * FIXME: there are also glyphs which the text rendering engine is
- * supposed to display left-right reflected, since no mirrored glyph
- * exists in Unicode itself to indicate the reflected form. Those are
- * listed in comments in BidiMirroring.txt. Many of them are
- * mathematical, e.g. the square root sign, or set difference
- * operator, or integral sign. No API currently exists here to
- * communicate the need for that reflected display back to the client.
- */
- static unsigned mirror_glyph(unsigned int ch)
- {
- static const struct {
- unsigned src, dst;
- } mirror_pairs[] = {
- #include "unicode/bidi_mirror.h"
- };
- int i, j, k;
- i = -1;
- j = lenof(mirror_pairs);
- while (j - i > 1) {
- k = (i + j) / 2;
- if (ch < mirror_pairs[k].src)
- j = k;
- else if (ch > mirror_pairs[k].src)
- i = k;
- else
- return mirror_pairs[k].dst;
- }
- return ch;
- }
- /*
- * Identify the bracket characters treated specially by bidi rule
- * BD19, and return their paired character(s).
- */
- typedef enum { BT_NONE, BT_OPEN, BT_CLOSE } BracketType;
- typedef struct BracketTypeData {
- unsigned partner, equiv_partner;
- BracketType type;
- } BracketTypeData;
- static BracketTypeData bracket_type(unsigned int ch)
- {
- static const struct {
- unsigned src;
- BracketTypeData payload;
- } bracket_pairs[] = {
- #include "unicode/bidi_brackets.h"
- };
- int i, j, k;
- i = -1;
- j = lenof(bracket_pairs);
- while (j - i > 1) {
- k = (i + j) / 2;
- if (ch < bracket_pairs[k].src) {
- j = k;
- } else if (ch > bracket_pairs[k].src) {
- i = k;
- } else {
- return bracket_pairs[k].payload;
- }
- }
- static const BracketTypeData null = { 0, 0, BT_NONE };
- return null;
- }
- /*
- * Function exported to front ends to allow them to identify
- * bidi-active characters (in case, for example, the platform's
- * text display function can't conveniently be prevented from doing
- * its own bidi and so special treatment is required for characters
- * that would cause the bidi algorithm to activate).
- *
- * This function is passed a single Unicode code point, and returns
- * nonzero if the presence of this code point can possibly cause
- * the bidi algorithm to do any reordering. Thus, any string
- * composed entirely of characters for which is_rtl() returns zero
- * should be safe to pass to a bidi-active platform display
- * function without fear.
- *
- * (is_rtl() must therefore also return true for any character
- * which would be affected by Arabic shaping, but this isn't
- * important because all such characters are right-to-left so it
- * would have flagged them anyway.)
- */
- bool is_rtl(int c)
- {
- return typeIsBidiActive(bidi_getType(c));
- }
- /* The Main shaping function, and the only one to be used
- * by the outside world.
- *
- * line: buffer to apply shaping to. this must be passed by doBidi() first
- * to: output buffer for the shaped data
- * count: number of characters in line
- */
- int do_shape(bidi_char *line, bidi_char *to, int count)
- {
- int i, tempShape;
- bool ligFlag = false;
- for (i=0; i<count; i++) {
- to[i] = line[i];
- tempShape = STYPE(line[i].wc);
- switch (tempShape) {
- case SC:
- break;
- case SU:
- break;
- case SR:
- tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
- if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = SFINAL((SISOLATED(line[i].wc)));
- else
- to[i].wc = SISOLATED(line[i].wc);
- break;
- case SD:
- /* Make Ligatures */
- tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
- if (line[i].wc == 0x644) {
- if (i > 0) switch (line[i-1].wc) {
- case 0x622:
- ligFlag = true;
- if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = 0xFEF6;
- else
- to[i].wc = 0xFEF5;
- break;
- case 0x623:
- ligFlag = true;
- if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = 0xFEF8;
- else
- to[i].wc = 0xFEF7;
- break;
- case 0x625:
- ligFlag = true;
- if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = 0xFEFA;
- else
- to[i].wc = 0xFEF9;
- break;
- case 0x627:
- ligFlag = true;
- if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = 0xFEFC;
- else
- to[i].wc = 0xFEFB;
- break;
- }
- if (ligFlag) {
- to[i-1].wc = 0x20;
- ligFlag = false;
- break;
- }
- }
- if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC)) {
- tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
- if ((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = SMEDIAL((SISOLATED(line[i].wc)));
- else
- to[i].wc = SFINAL((SISOLATED(line[i].wc)));
- break;
- }
- tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
- if ((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
- to[i].wc = SINITIAL((SISOLATED(line[i].wc)));
- else
- to[i].wc = SISOLATED(line[i].wc);
- break;
- }
- }
- return 1;
- }
- typedef enum { DO_NEUTRAL, DO_LTR, DO_RTL } DirectionalOverride;
- typedef struct DSStackEntry {
- /*
- * An entry in the directional status stack (rule section X).
- */
- unsigned char level;
- bool isolate;
- DirectionalOverride override;
- } DSStackEntry;
- typedef struct BracketStackEntry {
- /*
- * An entry in the bracket-pair-tracking stack (rule BD16).
- */
- unsigned ch;
- size_t c;
- } BracketStackEntry;
- typedef struct IsolatingRunSequence {
- size_t start, end;
- BidiType sos, eos, embeddingDirection;
- } IsolatingRunSequence;
- #define MAX_DEPTH 125 /* specified in the standard */
- struct BidiContext {
- /*
- * Storage space preserved between runs, all allocated to the same
- * length (internal_array_sizes).
- */
- size_t internal_array_sizes;
- BidiType *types, *origTypes;
- unsigned char *levels;
- size_t *irsindices, *bracketpos;
- bool *irsdone;
- /*
- * Separately allocated with its own size field
- */
- IsolatingRunSequence *irslist;
- size_t irslistsize;
- /*
- * Rewritten to point to the input to the currently active run of
- * the bidi algorithm
- */
- bidi_char *text;
- size_t textlen;
- /*
- * State within a run of the algorithm
- */
- BidiType paragraphOverride;
- DSStackEntry dsstack[MAX_DEPTH + 2];
- size_t ds_sp;
- size_t overflowIsolateCount, overflowEmbeddingCount, validIsolateCount;
- unsigned char paragraphLevel;
- size_t *irs;
- size_t irslen;
- BidiType sos, eos, embeddingDirection;
- BracketStackEntry bstack[63]; /* constant size specified in rule BD16 */
- };
- BidiContext *bidi_new_context(void)
- {
- BidiContext *ctx = snew(BidiContext);
- memset(ctx, 0, sizeof(BidiContext));
- return ctx;
- }
- void bidi_free_context(BidiContext *ctx)
- {
- sfree(ctx->types);
- sfree(ctx->origTypes);
- sfree(ctx->levels);
- sfree(ctx->irsindices);
- sfree(ctx->irsdone);
- sfree(ctx->bracketpos);
- sfree(ctx->irslist);
- sfree(ctx);
- }
- static void ensure_arrays(BidiContext *ctx, size_t textlen)
- {
- if (textlen <= ctx->internal_array_sizes)
- return;
- ctx->internal_array_sizes = textlen;
- ctx->types = sresize(ctx->types, ctx->internal_array_sizes, BidiType);
- ctx->origTypes = sresize(ctx->origTypes, ctx->internal_array_sizes,
- BidiType);
- ctx->levels = sresize(ctx->levels, ctx->internal_array_sizes,
- unsigned char);
- ctx->irsindices = sresize(ctx->irsindices, ctx->internal_array_sizes,
- size_t);
- ctx->irsdone = sresize(ctx->irsdone, ctx->internal_array_sizes, bool);
- ctx->bracketpos = sresize(ctx->bracketpos, ctx->internal_array_sizes,
- size_t);
- }
- static void setup_types(BidiContext *ctx)
- {
- for (size_t i = 0; i < ctx->textlen; i++)
- ctx->types[i] = ctx->origTypes[i] = bidi_getType(ctx->text[i].wc);
- }
- static bool text_needs_bidi(BidiContext *ctx)
- {
- /*
- * Initial optimisation: check for any bidi-active character at
- * all in an input line. If there aren't any, we can skip the
- * whole algorithm.
- *
- * Also include the paragraph override in this check!
- */
- for (size_t i = 0; i < ctx->textlen; i++)
- if (typeIsBidiActive(ctx->types[i]))
- return true;
- return typeIsBidiActive(ctx->paragraphOverride);
- }
- static size_t find_matching_pdi(const BidiType *types, size_t i, size_t size)
- {
- /* Assuming that types[i] is an isolate initiator, find its
- * matching PDI by rule BD9. */
- unsigned counter = 1;
- i++;
- for (; i < size; i++) {
- BidiType t = types[i];
- if (typeIsIsolateInitiator(t)) {
- counter++;
- } else if (t == PDI) {
- counter--;
- if (counter == 0)
- return i;
- }
- }
- /* If no PDI was found, return the length of the array. */
- return size;
- }
- static unsigned char rule_p2_p3(const BidiType *types, size_t size)
- {
- /*
- * Rule P2. Find the first strong type (L, R or AL), ignoring
- * anything inside an isolated segment.
- *
- * Rule P3. If that type is R or AL, choose a paragraph embeddding
- * level of 1, otherwise 0.
- */
- for (size_t i = 0; i < size; i++) {
- BidiType t = types[i];
- if (typeIsIsolateInitiator(t))
- i = find_matching_pdi(types, i, size);
- else if (typeIsStrong(t))
- return (t == L ? 0 : 1);
- }
- return 0; /* default if no strong type found */
- }
- static void set_paragraph_level(BidiContext *ctx)
- {
- if (ctx->paragraphOverride == L)
- ctx->paragraphLevel = 0;
- else if (ctx->paragraphOverride == R)
- ctx->paragraphLevel = 1;
- else
- ctx->paragraphLevel = rule_p2_p3(ctx->types, ctx->textlen);
- }
- static inline unsigned char nextOddLevel(unsigned char x) { return (x+1)|1; }
- static inline unsigned char nextEvenLevel(unsigned char x) { return (x|1)+1; }
- static inline void push(BidiContext *ctx, unsigned char level,
- DirectionalOverride override, bool isolate)
- {
- ctx->ds_sp++;
- assert(ctx->ds_sp < lenof(ctx->dsstack));
- ctx->dsstack[ctx->ds_sp].level = level;
- ctx->dsstack[ctx->ds_sp].override = override;
- ctx->dsstack[ctx->ds_sp].isolate = isolate;
- }
- static inline void pop(BidiContext *ctx)
- {
- assert(ctx->ds_sp > 0);
- ctx->ds_sp--;
- }
- static void process_explicit_embeddings(BidiContext *ctx)
- {
- /*
- * Rule X1 initialisation.
- */
- ctx->ds_sp = (size_t)-1;
- push(ctx, ctx->paragraphLevel, DO_NEUTRAL, false);
- ctx->overflowIsolateCount = 0;
- ctx->overflowEmbeddingCount = 0;
- ctx->validIsolateCount = 0;
- #define stk (&ctx->dsstack[ctx->ds_sp])
- for (size_t i = 0; i < ctx->textlen; i++) {
- BidiType t = ctx->types[i];
- switch (t) {
- case RLE: case LRE: case RLO: case LRO: {
- /* Rules X2-X5 */
- unsigned char newLevel;
- DirectionalOverride override;
- #ifndef REMOVE_FORMATTING_CHARS
- ctx->levels[i] = stk->level;
- #endif
- switch (t) {
- case RLE: /* rule X2 */
- newLevel = nextOddLevel(stk->level);
- override = DO_NEUTRAL;
- break;
- case LRE: /* rule X3 */
- newLevel = nextEvenLevel(stk->level);
- override = DO_NEUTRAL;
- break;
- case RLO: /* rule X4 */
- newLevel = nextOddLevel(stk->level);
- override = DO_RTL;
- break;
- case LRO: /* rule X5 */
- newLevel = nextEvenLevel(stk->level);
- override = DO_LTR;
- break;
- default:
- unreachable("how did this get past the outer switch?");
- }
- if (newLevel <= MAX_DEPTH &&
- ctx->overflowIsolateCount == 0 &&
- ctx->overflowEmbeddingCount == 0) {
- /* Embedding code is valid. Push a stack entry. */
- push(ctx, newLevel, override, false);
- } else {
- /* Embedding code is an overflow one. */
- if (ctx->overflowIsolateCount == 0)
- ctx->overflowEmbeddingCount++;
- }
- break;
- }
- case RLI: case LRI: case FSI: {
- /* Rules X5a, X5b, X5c */
- if (t == FSI) {
- /* Rule X5c: decide whether this should be treated
- * like RLI or LRI */
- size_t pdi = find_matching_pdi(ctx->types, i, ctx->textlen);
- unsigned char level = rule_p2_p3(ctx->types + (i + 1),
- pdi - (i + 1));
- t = (level == 1 ? RLI : LRI);
- }
- ctx->levels[i] = stk->level;
- if (stk->override != DO_NEUTRAL)
- ctx->types[i] = (stk->override == DO_LTR ? L :
- stk->override == DO_RTL ? R : t);
- unsigned char newLevel = (t == RLI ? nextOddLevel(stk->level) :
- nextEvenLevel(stk->level));
- if (newLevel <= MAX_DEPTH &&
- ctx->overflowIsolateCount == 0 &&
- ctx->overflowEmbeddingCount == 0) {
- /* Isolate code is valid. Push a stack entry. */
- push(ctx, newLevel, DO_NEUTRAL, true);
- ctx->validIsolateCount++;
- } else {
- /* Isolate code is an overflow one. */
- ctx->overflowIsolateCount++;
- }
- break;
- }
- case PDI: {
- /* Rule X6a */
- if (ctx->overflowIsolateCount > 0) {
- ctx->overflowIsolateCount--;
- } else if (ctx->validIsolateCount == 0) {
- /* Do nothing: spurious isolate-pop */
- } else {
- /* Valid isolate-pop. We expect that the stack must
- * therefore contain at least one isolate==true entry,
- * so pop everything up to and including it. */
- ctx->overflowEmbeddingCount = 0;
- while (!stk->isolate)
- pop(ctx);
- pop(ctx);
- ctx->validIsolateCount--;
- }
- ctx->levels[i] = stk->level;
- if (stk->override != DO_NEUTRAL)
- ctx->types[i] = (stk->override == DO_LTR ? L : R);
- break;
- }
- case PDF: {
- /* Rule X7 */
- if (ctx->overflowIsolateCount > 0) {
- /* Do nothing if we've overflowed on isolates */
- } else if (ctx->overflowEmbeddingCount > 0) {
- ctx->overflowEmbeddingCount--;
- } else if (ctx->ds_sp > 0 && !stk->isolate) {
- pop(ctx);
- } else {
- /* Do nothing: spurious embedding-pop */
- }
- #ifndef REMOVE_FORMATTING_CHARS
- ctx->levels[i] = stk->level;
- #endif
- break;
- }
- case B: {
- /* Rule X8: if an explicit paragraph separator appears in
- * this text at all then it does not participate in any of
- * the above, and just gets assigned the paragraph level.
- *
- * PS, it had better be right at the end of the text,
- * because we have not implemented rule P1 in this code. */
- assert(i == ctx->textlen - 1);
- ctx->levels[i] = ctx->paragraphLevel;
- break;
- }
- case BN: {
- /*
- * The section 5.2 adjustment to rule X6 says that we
- * apply it to BN just like any other class. But I think
- * this can't possibly give the same results as the
- * unmodified algorithm.
- *
- * Proof: adding RLO BN or LRO BN at the end of a
- * paragraph should not change the output of the standard
- * algorithm, because the override doesn't affect the BN
- * in rule X6, and then rule X9 removes both. But with the
- * modified rule X6, the BN is changed into R or L, and
- * then rule X9 doesn't remove it, and then you've added a
- * strong type that will set eos for the level run just
- * before the override. And whatever the standard
- * algorithm set eos to, _one_ of these override sequences
- * will disagree with it.
- *
- * So I think we just set the BN's level, and don't change
- * its type.
- */
- ctx->levels[i] = stk->level;
- break;
- }
- default: {
- /* Rule X6. */
- ctx->levels[i] = stk->level;
- if (stk->override != DO_NEUTRAL)
- ctx->types[i] = (stk->override == DO_LTR ? L : R);
- break;
- }
- }
- }
- #undef stk
- }
- static void remove_embedding_characters(BidiContext *ctx)
- {
- #ifndef REMOVE_FORMATTING_CHARS
- /*
- * Rule X9, as modified by section 5.2: turn embedding (but not
- * isolate) characters into BN.
- */
- for (size_t i = 0; i < ctx->textlen; i++) {
- BidiType t = ctx->types[i];
- if (typeIsRemovedDuringProcessing(t)) {
- ctx->types[i] = BN;
- /*
- * My own adjustment to the section 5.2 mods: a sequence
- * of contiguous BN generated by this setup should never
- * be at different levels from each other.
- *
- * An example where this goes wrong is if you open two
- * LREs in sequence, then close them again:
- *
- * ... LRE LRE PDF PDF ...
- *
- * The initial level assignment gives level 0 to the outer
- * LRE/PDF pair, and level 2 to the inner one. The
- * standard algorithm would remove all four, so this
- * doesn't matter, and you end up with no break in the
- * surrounding level run. But if you just rewrite the
- * types of all those characters to BN and leave the
- * levels in that state, then the modified algorithm will
- * leave the middle two BN at level 2, dividing what
- * should have been a long level run at level 0 into two
- * separate ones.
- */
- if (i > 0 && ctx->types[i-1] == BN)
- ctx->levels[i] = ctx->levels[i-1];
- }
- }
- #else
- /*
- * Rule X9, original version: completely remove embedding
- * start/end characters and also boundary neutrals.
- */
- size_t outpos = 0;
- for (size_t i = 0; i < ctx->textlen; i++) {
- BidiType t = ctx->types[i];
- if (!typeIsRemovedDuringProcessing(t)) {
- ctx->text[outpos] = ctx->text[i];
- ctx->levels[outpos] = ctx->levels[i];
- ctx->types[outpos] = ctx->types[i];
- ctx->origTypes[outpos] = ctx->origTypes[i];
- outpos++;
- }
- }
- ctx->textlen = outpos;
- #endif
- }
- typedef void (*irs_fn_t)(BidiContext *ctx);
- static void find_isolating_run_sequences(BidiContext *ctx, irs_fn_t process)
- {
- /*
- * Rule X10 / BD13. Now that we've assigned an embedding level to
- * each character in the text, we have to divide the text into
- * subsequences on which to do the next stage of processing.
- *
- * In earlier issues of the bidi algorithm, these subsequences
- * were contiguous in the original text, and each one was a 'level
- * run': a maximal contiguous subsequence of characters all at the
- * same embedding level.
- *
- * But now we have isolates, and the point of an (isolate
- * initiator ... PDI) sequence is that the whole sequence should
- * be treated like a single BN for the purposes of formatting
- * everything outside it. As a result, we now have to recombine
- * our level runs into longer sequences, on the principle that if
- * a level run ends with an isolate initiator, then we bring it
- * together with whatever later level run starts with the matching
- * PDI.
- *
- * These subsequences are no longer contiguous (the whole point is
- * that between the isolate initiator and the PDI is some other
- * text that we've skipped over). They're called 'isolating run
- * sequences'.
- */
- memset(ctx->irsdone, 0, ctx->textlen);
- size_t i = 0;
- size_t n_irs = 0;
- size_t indexpos = 0;
- while (i < ctx->textlen) {
- if (ctx->irsdone[i]) {
- i++;
- continue;
- }
- /*
- * Found a character not already processed. Start a new
- * sequence here.
- */
- sgrowarray(ctx->irslist, ctx->irslistsize, n_irs);
- IsolatingRunSequence *irs = &ctx->irslist[n_irs++];
- irs->start = indexpos;
- size_t j = i;
- size_t irslevel = ctx->levels[i];
- while (j < ctx->textlen) {
- /*
- * We expect that all level runs in this sequence will be
- * at the same level as each other, by construction of how
- * we set up the levels from the isolates in the first
- * place.
- */
- assert(ctx->levels[j] == irslevel);
- do {
- ctx->irsdone[j] = true;
- ctx->irsindices[indexpos++] = j++;
- } while (j < ctx->textlen && ctx->levels[j] == irslevel);
- if (!typeIsIsolateInitiator(ctx->types[j-1]))
- break; /* this IRS is ended */
- j = find_matching_pdi(ctx->types, j-1, ctx->textlen);
- }
- irs->end = indexpos;
- /*
- * Determine the start-of-sequence and end-of-sequence types
- * for this sequence.
- *
- * These depend on the embedding levels of surrounding text.
- * But processing each run can change those levels. That's why
- * we have to use a two-pass strategy here, first identifying
- * all the isolating run sequences using the input level data,
- * and not processing any of them until we know where they all
- * are.
- */
- size_t p;
- unsigned char level_inside, level_outside, level_max;
- p = i;
- level_inside = ctx->levels[p];
- level_outside = ctx->paragraphLevel;
- while (p > 0) {
- p--;
- if (ctx->types[p] != BN) {
- level_outside = ctx->levels[p];
- break;
- }
- }
- level_max = max(level_inside, level_outside);
- irs->sos = (level_max % 2 ? R : L);
- p = ctx->irsindices[irs->end - 1];
- level_inside = ctx->levels[p];
- level_outside = ctx->paragraphLevel;
- if (typeIsIsolateInitiator(ctx->types[p])) {
- /* Special case: if an isolating run sequence ends in an
- * unmatched isolate initiator, then level_outside is
- * taken to be the paragraph embedding level and the
- * loop below is skipped. */
- } else {
- while (p+1 < ctx->textlen) {
- p++;
- if (ctx->types[p] != BN) {
- level_outside = ctx->levels[p];
- break;
- }
- }
- }
- level_max = max(level_inside, level_outside);
- irs->eos = (level_max % 2 ? R : L);
- irs->embeddingDirection = (irslevel % 2 ? R : L);
- /*
- * Now we've listed in ctx->irsindices[] the index of every
- * character that's part of this isolating run sequence, and
- * recorded an entry in irslist containing the interval of
- * indices relevant to this IRS, plus its assorted metadata.
- * We've also marked those locations in the input text as done
- * in ctx->irsdone, so that we'll skip over them when the
- * outer iteration reaches them later.
- */
- }
- for (size_t k = 0; k < n_irs; k++) {
- IsolatingRunSequence *irs = &ctx->irslist[k];
- ctx->irs = ctx->irsindices + irs->start;
- ctx->irslen = irs->end - irs->start;
- ctx->sos = irs->sos;
- ctx->eos = irs->eos;
- ctx->embeddingDirection = irs->embeddingDirection;
- process(ctx);
- }
- /* Reset irslen to 0 when we've finished. This means any other
- * functions that absentmindedly try to use irslen at all will end
- * up doing nothing at all, which should be easier to detect and
- * debug than if they run on subtly the wrong subset of the
- * text. */
- ctx->irslen = 0;
- }
- static void remove_nsm(BidiContext *ctx)
- {
- /* Rule W1: NSM gains the type of the previous character, or sos
- * at the start of the run, with the exception that isolation
- * boundaries turn into ON. */
- BidiType prevType = ctx->sos;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (t == NSM) {
- ctx->types[i] = prevType;
- } else if (typeIsIsolateInitiatorOrPDI(t)) {
- prevType = ON;
- #ifndef REMOVE_FORMATTING_CHARS
- } else if (t == BN) {
- /* section 5.2 adjustment: these don't affect prevType */
- #endif
- } else {
- prevType = t;
- }
- }
- }
- static void change_en_to_an(BidiContext *ctx)
- {
- /* Rule W2: EN becomes AN if the previous strong type is AL. (The
- * spec says that the 'previous strong type' is counted as sos at
- * the start of the run, although it hardly matters, since sos
- * can't be AL.) */
- BidiType prevStrongType = ctx->sos;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (t == EN && prevStrongType == AL) {
- ctx->types[i] = AN;
- } else if (typeIsStrong(t)) {
- prevStrongType = t;
- }
- }
- }
- static void change_al_to_r(BidiContext *ctx)
- {
- /* Rule W3: AL becomes R unconditionally. (The only difference
- * between the two types was their effect on nearby numbers, which
- * was dealt with in rule W2, so now we're done with the
- * distinction.) */
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- if (ctx->types[i] == AL)
- ctx->types[i] = R;
- }
- }
- static void eliminate_separators_between_numbers(BidiContext *ctx)
- {
- /* Rule W4: a single numeric separator between two numbers of the
- * same type compatible with that separator takes the type of the
- * number. ES is a separator type compatible only with EN; CS is a
- * separator type compatible with either EN or AN.
- *
- * Section 5.2 adjustment: intervening BNs do not break this, so
- * instead of simply looking at types[irs[c-1]] and types[irs[c+1]],
- * we must track the last three indices we saw that were not BN. */
- size_t i1 = 0, i2 = 0;
- BidiType t0 = ON, t1 = ON, t2 = ON;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- #ifndef REMOVE_FORMATTING_CHARS
- if (t == BN)
- continue;
- #endif
- i1 = i2; i2 = i;
- t0 = t1; t1 = t2; t2 = t;
- if (t0 == t2 && ((t1 == ES && t0 == EN) ||
- (t1 == CS && (t0 == EN || t0 == AN)))) {
- ctx->types[i1] = t0;
- }
- }
- }
- static void eliminate_et_next_to_en(BidiContext *ctx)
- {
- /* Rule W5: a sequence of ET adjacent to an EN take the type EN.
- * This is easiest to implement with one loop in each direction.
- *
- * Section 5.2 adjustment: include BN with ET. (We don't need to
- * #ifdef that out, because in the standard algorithm, we won't
- * have any BN left in any case.) */
- bool modifying = false;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (t == EN) {
- modifying = true;
- } else if (modifying && typeIsETOrBN(t)) {
- ctx->types[i] = EN;
- } else {
- modifying = false;
- }
- }
- for (size_t c = ctx->irslen; c-- > 0 ;) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (t == EN) {
- modifying = true;
- } else if (modifying && typeIsETOrBN(t)) {
- ctx->types[i] = EN;
- } else {
- modifying = false;
- }
- }
- }
- static void eliminate_separators_and_terminators(BidiContext *ctx)
- {
- /* Rule W6: all separators and terminators change to ON.
- *
- * (The spec is not quite clear on which bidi types are included
- * in this; one assumes ES, ET and CS, but what about S? I _think_
- * the answer is that this is a rule in the W section, so it's
- * implicitly supposed to only apply to types designated as weakly
- * directional, so not S.) */
- #ifndef REMOVE_FORMATTING_CHARS
- /*
- * Section 5.2 adjustment: this also applies to any BN adjacent on
- * either side to one of these types, which is easiest to
- * implement with a separate double-loop converting those to an
- * arbitrary one of the affected types, say CS.
- *
- * This double loop can be completely skipped in the standard
- * algorithm.
- */
- bool modifying = false;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (typeIsWeakSeparatorOrTerminator(t)) {
- modifying = true;
- } else if (modifying && t == BN) {
- ctx->types[i] = CS;
- } else {
- modifying = false;
- }
- }
- for (size_t c = ctx->irslen; c-- > 0 ;) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (typeIsWeakSeparatorOrTerminator(t)) {
- modifying = true;
- } else if (modifying && t == BN) {
- ctx->types[i] = CS;
- } else {
- modifying = false;
- }
- }
- #endif
- /* Now the main part of rule W6 */
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (typeIsWeakSeparatorOrTerminator(t))
- ctx->types[i] = ON;
- }
- }
- static void change_en_to_l(BidiContext *ctx)
- {
- /* Rule W7: EN becomes L if the previous strong type (or sos) is L. */
- BidiType prevStrongType = ctx->sos;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (t == EN && prevStrongType == L) {
- ctx->types[i] = L;
- } else if (typeIsStrong(t)) {
- prevStrongType = t;
- }
- }
- }
- typedef void (*bracket_pair_fn)(BidiContext *ctx, size_t copen, size_t cclose);
- static void find_bracket_pairs(BidiContext *ctx, bracket_pair_fn process)
- {
- const size_t NO_BRACKET = ~(size_t)0;
- /*
- * Rule BD16.
- */
- size_t sp = 0;
- for (size_t c = 0; c < ctx->irslen; c++)
- ctx->bracketpos[c] = NO_BRACKET;
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- unsigned wc = ctx->text[i].wc;
- BracketTypeData bt = bracket_type(wc);
- if (bt.type == BT_OPEN) {
- if (sp >= lenof(ctx->bstack)) {
- /*
- * Stack overflow. The spec says we simply give up at
- * this point.
- */
- goto found_all_pairs;
- }
- ctx->bstack[sp].ch = wc;
- ctx->bstack[sp].c = c;
- sp++;
- } else if (bt.type == BT_CLOSE) {
- size_t new_sp = sp;
- /*
- * Search up the stack for an entry containing a matching
- * open bracket. If we find it, pop that entry and
- * everything deeper, and record a matching pair. If we
- * reach the bottom of the stack without finding anything,
- * leave sp where it started.
- */
- while (new_sp-- > 0) {
- if (ctx->bstack[new_sp].ch == bt.partner ||
- ctx->bstack[new_sp].ch == bt.equiv_partner) {
- /* Found a stack element matching this one */
- size_t cstart = ctx->bstack[new_sp].c;
- ctx->bracketpos[cstart] = c;
- sp = new_sp;
- break;
- }
- }
- }
- }
- found_all_pairs:
- for (size_t c = 0; c < ctx->irslen; c++) {
- if (ctx->bracketpos[c] != NO_BRACKET) {
- process(ctx, c, ctx->bracketpos[c]);
- }
- }
- }
- static BidiType get_bracket_type(BidiContext *ctx, size_t copen, size_t cclose)
- {
- /*
- * Rule N0: a pair of matched brackets containing at least one
- * strong type takes on the current embedding direction, unless
- * all of these are true at once:
- *
- * (a) there are no strong types inside the brackets matching the
- * current embedding direction
- * (b) there _is_ at least one strong type inside the brackets
- * that is _opposite_ to the current embedding direction
- * (c) the strong type preceding the open bracket is also
- * opposite to the current embedding direction
- *
- * in which case they take on the opposite direction.
- *
- * For these purposes, number types (EN and AN) count as R.
- */
- bool foundOppositeTypeInside = false;
- for (size_t c = copen + 1; c < cclose; c++) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (typeIsStrongOrNumber(t)) {
- t = t == L ? L : R; /* numbers count as R */
- if (t == ctx->embeddingDirection) {
- /* Found something inside the brackets matching the
- * current level, so (a) is violated. */
- return ctx->embeddingDirection;
- } else {
- foundOppositeTypeInside = true;
- }
- }
- }
- if (!foundOppositeTypeInside) {
- /* No strong types at all inside the brackets, so return ON to
- * indicate that we're not messing with their type at all. */
- return ON;
- }
- /* There was an opposite strong type in the brackets. Look
- * backwards to the preceding strong type, and go with that,
- * whichever it is. */
- for (size_t c = copen; c-- > 0 ;) {
- size_t i = ctx->irs[c];
- BidiType t = ctx->types[i];
- if (typeIsStrongOrNumber(t)) {
- t = t == L ? L : R; /* numbers count as R */
- return t;
- }
- }
- /* Fallback: if the preceding strong type was not found, go with
- * sos. */
- return ctx->sos;
- }
- static void reset_bracket_type(BidiContext *ctx, size_t c, BidiType t)
- {
- /* Final bullet point of rule N0: when we change the type of a
- * bracket, the same change applies to any contiguous sequence of
- * characters after it whose _original_ bidi type was NSM. */
- do {
- ctx->types[ctx->irs[c++]] = t;
- #ifndef REMOVE_FORMATTING_CHARS
- while (c < ctx->irslen && ctx->origTypes[ctx->irs[c]] == BN) {
- /* Section 5.2 adjustment: skip past BN in the process. */
- c++;
- }
- #endif
- } while (c < ctx->irslen && ctx->origTypes[ctx->irs[c]] == NSM);
- }
- static void resolve_brackets(BidiContext *ctx, size_t copen, size_t cclose)
- {
- if (typeIsNeutral(ctx->types[ctx->irs[copen]]) &&
- typeIsNeutral(ctx->types[ctx->irs[cclose]])) {
- BidiType t = get_bracket_type(ctx, copen, cclose);
- if (t != ON) {
- reset_bracket_type(ctx, copen, t);
- reset_bracket_type(ctx, cclose, t);
- }
- }
- }
- static void remove_ni(BidiContext *ctx)
- {
- /*
- * Rules N1 and N2 together: neutral or isolate characters take
- * the direction of the surrounding strong text if the nearest
- * strong characters on each side match, and otherwise, they take
- * the embedding direction.
- */
- const size_t NO_INDEX = ~(size_t)0;
- BidiType prevStrongType = ctx->sos;
- size_t c_ni_start = NO_INDEX;
- for (size_t c = 0; c <= ctx->irslen; c++) {
- BidiType t;
- if (c < ctx->irslen) {
- size_t i = ctx->irs[c];
- t = ctx->types[i];
- } else {
- /* One extra loop iteration, using eos to resolve the
- * final sequence of NI if any */
- t = ctx->eos;
- }
- if (typeIsStrongOrNumber(t)) {
- t = t == L ? L : R; /* numbers count as R */
- if (c_ni_start != NO_INDEX) {
- /* There are some NI we have to fix up */
- BidiType ni_type = (t == prevStrongType ? t :
- ctx->embeddingDirection);
- for (size_t c2 = c_ni_start; c2 < c; c2++) {
- size_t i2 = ctx->irs[c2];
- BidiType t2 = ctx->types[i2];
- if (typeIsNeutralOrIsolate(t2))
- ctx->types[i2] = ni_type;
- }
- }
- prevStrongType = t;
- c_ni_start = NO_INDEX;
- } else if (typeIsNeutralOrIsolate(t) && c_ni_start == NO_INDEX) {
- c_ni_start = c;
- }
- }
- }
- static void resolve_implicit_levels(BidiContext *ctx)
- {
- /* Rules I1 and I2 */
- for (size_t c = 0; c < ctx->irslen; c++) {
- size_t i = ctx->irs[c];
- unsigned char level = ctx->levels[i];
- BidiType t = ctx->types[i];
- if (level % 2 == 0) {
- /* Rule I1 */
- if (t == R)
- ctx->levels[i] += 1;
- else if (t == AN || t == EN)
- ctx->levels[i] += 2;
- } else {
- /* Rule I2 */
- if (t == L || t == AN || t == EN)
- ctx->levels[i] += 1;
- }
- }
- }
- static void process_isolating_run_sequence(BidiContext *ctx)
- {
- /* Section W: resolve weak types */
- remove_nsm(ctx);
- change_en_to_an(ctx);
- change_al_to_r(ctx);
- eliminate_separators_between_numbers(ctx);
- eliminate_et_next_to_en(ctx);
- eliminate_separators_and_terminators(ctx);
- change_en_to_l(ctx);
- /* Section N: resolve neutral types (and isolates) */
- find_bracket_pairs(ctx, resolve_brackets);
- remove_ni(ctx);
- /* Section I: resolve implicit levels */
- resolve_implicit_levels(ctx);
- }
- static void reset_whitespace_and_separators(BidiContext *ctx)
- {
- /*
- * Rule L1: segment and paragraph separators, plus whitespace
- * preceding them, all reset to the paragraph embedding level.
- * This also applies to whitespace at the very end.
- *
- * This is done using the original types, not the versions that
- * the rest of this algorithm has been merrily mutating.
- */
- bool modifying = true;
- for (size_t i = ctx->textlen; i-- > 0 ;) {
- BidiType t = ctx->origTypes[i];
- if (typeIsSegmentOrParaSeparator(t)) {
- ctx->levels[i] = ctx->paragraphLevel;
- modifying = true;
- } else if (modifying) {
- if (typeIsWhitespaceOrIsolate(t)) {
- ctx->levels[i] = ctx->paragraphLevel;
- } else if (!typeIsRemovedDuringProcessing(t)) {
- modifying = false;
- }
- }
- }
- #ifndef REMOVE_FORMATTING_CHARS
- /*
- * Section 5.2 adjustment: types removed by rule X9 take the level
- * of the character to their left.
- */
- for (size_t i = 0; i < ctx->textlen; i++) {
- BidiType t = ctx->origTypes[i];
- if (typeIsRemovedDuringProcessing(t)) {
- /* Section 5.2 adjustment */
- ctx->levels[i] = (i > 0 ? ctx->levels[i-1] : ctx->paragraphLevel);
- }
- }
- #endif /* ! REMOVE_FORMATTING_CHARS */
- }
- static void reverse(BidiContext *ctx, size_t start, size_t end)
- {
- for (size_t i = start, j = end; i < j; i++, j--) {
- bidi_char tmp = ctx->text[i];
- ctx->text[i] = ctx->text[j];
- ctx->text[j] = tmp;
- }
- }
- static void mirror_glyphs(BidiContext *ctx)
- {
- /*
- * Rule L3: any character with a mirror-image pair at an odd
- * embedding level is replaced by its mirror image.
- *
- * This is specified in the standard as happening _after_ rule L2
- * (the actual reordering of the text). But it's much easier to
- * implement it before, while our levels[] array still matches up
- * to the text order.
- */
- for (size_t i = 0; i < ctx->textlen; i++) {
- if (ctx->levels[i] % 2)
- ctx->text[i].wc = mirror_glyph(ctx->text[i].wc);
- }
- }
- static void reverse_sequences(BidiContext *ctx)
- {
- /*
- * Rule L2: every maximal contiguous sequence of characters at a
- * given level or higher is reversed.
- */
- unsigned level = 0;
- for (size_t i = 0; i < ctx->textlen; i++)
- level = max(level, ctx->levels[i]);
- for (; level >= 1; level--) {
- for (size_t i = 0; i < ctx->textlen; i++) {
- if (ctx->levels[i] >= level) {
- size_t start = i;
- while (i+1 < ctx->textlen && ctx->levels[i+1] >= level)
- i++;
- reverse(ctx, start, i);
- }
- }
- }
- }
- /*
- * The Main Bidi Function. The two wrappers below it present different
- * external APIs for different purposes, but everything comes through
- * here.
- *
- * text: a buffer of size textlen containing text to apply the
- * Bidirectional algorithm to.
- */
- static void do_bidi_new(BidiContext *ctx, bidi_char *text, size_t textlen)
- {
- ensure_arrays(ctx, textlen);
- ctx->text = text;
- ctx->textlen = textlen;
- setup_types(ctx);
- /* Quick initial test: see if we need to bother with any work at all */
- if (!text_needs_bidi(ctx))
- return;
- set_paragraph_level(ctx);
- process_explicit_embeddings(ctx);
- remove_embedding_characters(ctx);
- find_isolating_run_sequences(ctx, process_isolating_run_sequence);
- /* If this implementation distinguished paragraphs from lines,
- * then this would be the point where we repeat the remainder of
- * the algorithm once for each line in the paragraph. */
- reset_whitespace_and_separators(ctx);
- mirror_glyphs(ctx);
- reverse_sequences(ctx);
- }
- size_t do_bidi_test(BidiContext *ctx, bidi_char *text, size_t textlen,
- int override)
- {
- ctx->paragraphOverride = (override > 0 ? L : override < 0 ? R : ON);
- do_bidi_new(ctx, text, textlen);
- return ctx->textlen;
- }
- void do_bidi(BidiContext *ctx, bidi_char *text, size_t textlen)
- {
- #ifdef REMOVE_FORMATTING_CHARACTERS
- abort(); /* can't use the standard algorithm in a live terminal */
- #else
- ctx->paragraphOverride = ON;
- do_bidi_new(ctx, text, textlen);
- #endif
- }
|