123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223 |
- (*
- * _ _ ____ _
- * _| || |_/ ___| ___ _ __ _ __ ___ | |
- * |_ .. _\___ \ / _ \ '_ \| '_ \ / _ \| |
- * |_ _|___) | __/ |_) | |_) | (_) |_|
- * |_||_| |____/ \___| .__/| .__/ \___/(_)
- * |_| |_|
- *
- * Personal Social Web.
- *
- * Copyright (C) The #Seppo contributors. All rights reserved.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *)
- (* https://www.unicode.org/reports/tr31/#D2 *)
- type tag = Tag of string
- type state = Ready | Continue | Medial | Ignore
- (*
- * https://www.unicode.org/reports/tr31/#D2
- *)
- let of_string s =
- let is_start uc =
- match Uchar.to_int uc with
- | 0x0023 (* '#' *)
- (* | 0xFE5F | 0xFF03 *) -> true
- | _ -> false
- in
- let is_continue uc =
- (Uucp.Id.is_xid_continue uc
- (* || Uucp.Emoji.is_extended_pictographic uc
- || Uucp.Emoji.is_emoji_component uc *)
- ||
- match Uchar.to_int uc with
- | 0x002B (* '+' *)
- | 0x002D (* '-' *)
- | 0x005F (* '_' *)
- | 0x200D (* zero width joiner *) ->
- true
- | _ -> false)
- && not (is_start uc)
- and is_medial uc =
- (* https://www.unicode.org/reports/tr31/#Table_Optional_Medial *)
- match Uchar.to_int uc with
- | 0x0040 (* '@' *) | 0x00A7 (* '§' *) | 0x2020 (* '†' *) -> true
- | _ -> false
- in
- let b = Buffer.create 42 in
- let flush_segment acc =
- let segment = Buffer.contents b in
- Buffer.clear b;
- if segment = "" || segment = "#"
- then acc
- else Tag segment :: acc
- and buffer u acc =
- Uutf.Buffer.add_utf_8 b u;
- acc
- in
- let each_uchar (st, acc) _ = function
- | `Malformed _ -> (st, acc)
- | `Uchar u -> (
- match st with
- | Ready ->
- if
- is_start u
- (* start with emoji even without prior # *)
- (* || Uucp.Emoji.is_extended_pictographic u *)
- then (Continue, buffer u acc)
- else if is_continue u then (Ignore, acc)
- else (Ready, acc)
- | Continue ->
- if is_continue u then (Continue, buffer u acc)
- else if is_medial u then (Medial, buffer u acc)
- else (Ready, flush_segment acc)
- | Medial ->
- if is_continue u then (Continue, buffer u acc)
- else (Ready, flush_segment acc)
- | Ignore -> if is_continue u then (Ignore, acc) else (Ready, acc))
- in
- let _, ret = Uutf.String.fold_utf_8 each_uchar (Ready, []) s in
- flush_segment ret |> List.rev
- (* https://codeberg.org/mro/ShaarliGo/src/branch/master/tags.go#L104 *)
- let fold (Tag s) =
- (* https://erratique.ch/software/uunf/doc/Uunf/index.html#utf8
- * https://erratique.ch/software/uutf/doc/Uutf/String/ *)
- let utf8_norm_filter pred nf s =
- let b = Buffer.create (String.length s * 3) in
- let n = Uunf.create nf in
- let rec add v =
- match Uunf.add n v with
- | `Uchar u ->
- if pred u then Uutf.Buffer.add_utf_8 b u;
- add `Await
- | `Await | `End -> ()
- in
- let add_uchar (_ : unit) (_ : int) = function
- | `Malformed _ -> add (`Uchar Uutf.u_rep)
- | `Uchar _ as u -> add u
- in
- Uutf.String.fold_utf_8 add_uchar () s;
- add `End;
- Buffer.contents b
- in
- s
- |> utf8_norm_filter (fun u -> `Mn != Uucp.Gc.general_category u) `NFD
- |> Uunf_string.normalize_utf_8 `NFC
- |> String.lowercase_ascii
- let diff cmp a_srt b_srt =
- let rec f a b (same, plus, minus) =
- match (a, b) with
- | [], _ -> (same, plus |> List.rev_append b, minus)
- | _, [] -> (same, plus, minus |> List.rev_append a)
- | ah :: at, bh :: bt ->
- let cm = cmp ah bh in
- if cm < 0 then f at b (same, plus, ah :: minus)
- else if cm > 0 then f a bt (same, bh :: plus, minus)
- else f at bt (ah :: same, plus, minus)
- in
- let r0, r1, r2 = f a_srt b_srt ([], [], []) in
- (r0 |> List.rev, r1 |> List.rev, r2 |> List.rev)
- (* LUT for folded keys -> label writing *)
- module Tmap = Map.Make (String)
- let add_tag v m =
- let k = fold v in
- match Tmap.find_opt k m with
- (* add only if not already there *)
- | None -> Tmap.add k v m
- | Some _ -> m
- let add_tag_list (v : tag list) m =
- let fkt m t = add_tag t m in
- List.fold_left fkt m v
- let add_tag_seq (v : tag Seq.t) m =
- let fkt m t = add_tag t m in
- Seq.fold_left fkt m v
- (* Find all tags in their existing spelling and append to the body if necessary.
- *
- * Data:
- * - title (line)
- * - body (multiline)
- * - tags list
- * - lookup evtl. existing Tag -> Tag with 'fold' equality or add it
- *
- * https://codeberg.org/mro/ShaarliGo/src/branch/master/tags.go#L124
- * https://discuss.ocaml.org/t/associative-stuff-ocaml-api/9870/3?u=mro
- *)
- let normalise0 short long tags lut f_add f_find : string * string * tag list =
- let txt = short |> of_string |> List.rev_append (long |> of_string) in
- let lut = lut |> f_add tags |> f_add txt in
- let luf v = lut |> f_find (fold v) in
- let tags = tags |> List.rev_map luf
- and txt = txt |> List.rev_map luf
- and tcmp (Tag a) (Tag b) = String.compare a b in
- let tsrt = List.sort_uniq tcmp in
- let _same, plus, minus = txt |> tsrt |> diff tcmp (tags |> tsrt) in
- let long =
- match minus with
- | [] -> long
- | ls ->
- long ^ "\n" ^ (ls |> List.map (fun (Tag t) -> t) |> String.concat " ")
- in
- (short, long, tags |> List.rev_append plus |> tsrt)
- let normalise short long tags (lut : tag Tmap.t) : string * string * tag list =
- normalise0 short long tags lut add_tag_list Tmap.find
- let slurp_channel ic =
- let chunk = 4 * 0x400 in
- let b = Buffer.create chunk in
- (try Buffer.add_channel b ic chunk with End_of_file -> ());
- b |> Buffer.to_bytes |> Bytes.to_string
- let sift_channel ic = Ok (ic |> slurp_channel |> of_string)
- let cdb = Mapcdb.Cdb "app/var/cache/tags.cdb"
- (** use a cdb as a backing map (store).
- *
- * Mapcdb cannot satisfy a Map.Make (String) yet
- *)
- let cdb_normalise short long tags (lut : Mapcdb.cdb) =
- let f_add (v : tag list) lut =
- let keep _ = true in
- let fkt_add_all (add1 : ((bytes*bytes) -> unit)) =
- let _added = v |> List.fold_left (fun lut' item ->
- let k = item |> fold |> Bytes.of_string in
- (match Mapcdb.find_opt k lut' with
- | None ->
- let (Tag v) = item in
- add1 (k,v |> Bytes.of_string);
- | Some _ -> ());
- lut'
- ) lut in
- ()
- in
- Mapcdb.add_many keep fkt_add_all lut
- in
- let f_find s lut =
- let s' = (Tag s) |> fold in
- Tag (match Mapcdb.find_string_opt s' lut with
- | None -> s
- | Some s -> s)
- in
- normalise0 short long tags lut f_add f_find
|