mro
/
seppo


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
							(*
 *    _  _   ____                         _  
 *  _| || |_/ ___|  ___ _ __  _ __   ___ | | 
 * |_  ..  _\___ \ / _ \ '_ \| '_ \ / _ \| | 
 * |_      _|___) |  __/ |_) | |_) | (_) |_| 
 *   |_||_| |____/ \___| .__/| .__/ \___/(_) 
 *                     |_|   |_|             
 *
 * Personal Social Web.
 *
 * Copyright (C) The #Seppo contributors. All rights reserved.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

(* https://www.unicode.org/reports/tr31/#D2 *)
type tag = Tag of string
type state = Ready | Continue | Medial | Ignore

(*
 * https://www.unicode.org/reports/tr31/#D2
 *)
let of_string s =
  let is_start uc =
    match Uchar.to_int uc with
    | 0x0023 (* '#' *)
      (* | 0xFE5F | 0xFF03 *) -> true
    | _ -> false
  in
  let is_continue uc =
    (Uucp.Id.is_xid_continue uc
     (* || Uucp.Emoji.is_extended_pictographic uc
        || Uucp.Emoji.is_emoji_component uc *)
     ||
     match Uchar.to_int uc with
     | 0x002B (* '+' *)
     | 0x002D (* '-' *)
     | 0x005F (* '_' *)
     | 0x200D (* zero width joiner *) ->
       true
     | _ -> false)
    && not (is_start uc)
  and is_medial uc =
    (* https://www.unicode.org/reports/tr31/#Table_Optional_Medial *)
    match Uchar.to_int uc with
    | 0x0040 (* '@' *) | 0x00A7 (* '§' *) | 0x2020 (* '†' *) -> true
    | _ -> false
  in
  let b = Buffer.create 42 in
  let flush_segment acc =
    let segment = Buffer.contents b in
    Buffer.clear b;
    if segment = "" || segment = "#" 
    then acc 
    else Tag segment :: acc
  and buffer u acc =
    Uutf.Buffer.add_utf_8 b u;
    acc
  in
  let each_uchar (st, acc) _ = function
    | `Malformed _ -> (st, acc)
    | `Uchar u -> (
        match st with
        | Ready ->
          if
            is_start u
            (* start with emoji even without prior # *)
            (* || Uucp.Emoji.is_extended_pictographic u *)
          then (Continue, buffer u acc)
          else if is_continue u then (Ignore, acc)
          else (Ready, acc)
        | Continue ->
          if is_continue u then (Continue, buffer u acc)
          else if is_medial u then (Medial, buffer u acc)
          else (Ready, flush_segment acc)
        | Medial ->
          if is_continue u then (Continue, buffer u acc)
          else (Ready, flush_segment acc)
        | Ignore -> if is_continue u then (Ignore, acc) else (Ready, acc))
  in
  let _, ret = Uutf.String.fold_utf_8 each_uchar (Ready, []) s in
  flush_segment ret |> List.rev

(* https://codeberg.org/mro/ShaarliGo/src/branch/master/tags.go#L104 *)
let fold (Tag s) =
  (* https://erratique.ch/software/uunf/doc/Uunf/index.html#utf8 
   * https://erratique.ch/software/uutf/doc/Uutf/String/ *)
  let utf8_norm_filter pred nf s =
    let b = Buffer.create (String.length s * 3) in
    let n = Uunf.create nf in
    let rec add v =
      match Uunf.add n v with
      | `Uchar u ->
        if pred u then Uutf.Buffer.add_utf_8 b u;
        add `Await
      | `Await | `End -> ()
    in
    let add_uchar (_ : unit) (_ : int) = function
      | `Malformed _ -> add (`Uchar Uutf.u_rep)
      | `Uchar _ as u -> add u
    in
    Uutf.String.fold_utf_8 add_uchar () s;
    add `End;
    Buffer.contents b
  in
  s
  |> utf8_norm_filter (fun u -> `Mn != Uucp.Gc.general_category u) `NFD
  |> Uunf_string.normalize_utf_8 `NFC
  |> String.lowercase_ascii

let diff cmp a_srt b_srt =
  let rec f a b (same, plus, minus) =
    match (a, b) with
    | [], _ -> (same, plus |> List.rev_append b, minus)
    | _, [] -> (same, plus, minus |> List.rev_append a)
    | ah :: at, bh :: bt ->
      let cm = cmp ah bh in
      if cm < 0 then f at b (same, plus, ah :: minus)
      else if cm > 0 then f a bt (same, bh :: plus, minus)
      else f at bt (ah :: same, plus, minus)
  in
  let r0, r1, r2 = f a_srt b_srt ([], [], []) in
  (r0 |> List.rev, r1 |> List.rev, r2 |> List.rev)

(* LUT for folded keys -> label writing *)
module Tmap = Map.Make (String)

let add_tag v m =
  let k = fold v in
  match Tmap.find_opt k m with
  (* add only if not already there *)
  | None -> Tmap.add k v m
  | Some _ -> m

let add_tag_list (v : tag list) m =
  let fkt m t = add_tag t m in
  List.fold_left fkt m v

let add_tag_seq (v : tag Seq.t) m =
  let fkt m t = add_tag t m in
  Seq.fold_left fkt m v

(* Find all tags in their existing spelling and append to the body if necessary.
 *
 * Data:
 * - title (line)
 * - body (multiline)
 * - tags list
 * - lookup evtl. existing Tag -> Tag with 'fold' equality or add it
 *
 * https://codeberg.org/mro/ShaarliGo/src/branch/master/tags.go#L124
 * https://discuss.ocaml.org/t/associative-stuff-ocaml-api/9870/3?u=mro
*)
let normalise0 short long tags lut f_add f_find : string * string * tag list =
  let txt = short |> of_string |> List.rev_append (long |> of_string) in
  let lut = lut |> f_add tags |> f_add txt in
  let luf v = lut |> f_find (fold v) in
  let tags = tags |> List.rev_map luf
  and txt = txt |> List.rev_map luf
  and tcmp (Tag a) (Tag b) = String.compare a b in
  let tsrt = List.sort_uniq tcmp in
  let _same, plus, minus = txt |> tsrt |> diff tcmp (tags |> tsrt) in
  let long =
    match minus with
    | [] -> long
    | ls ->
      long ^ "\n" ^ (ls |> List.map (fun (Tag t) -> t) |> String.concat " ")
  in
  (short, long, tags |> List.rev_append plus |> tsrt)

let normalise short long tags (lut : tag Tmap.t) : string * string * tag list =
  normalise0 short long tags lut add_tag_list Tmap.find

let slurp_channel ic =
  let chunk = 4 * 0x400 in
  let b = Buffer.create chunk in
  (try Buffer.add_channel b ic chunk with End_of_file -> ());
  b |> Buffer.to_bytes |> Bytes.to_string

let sift_channel ic = Ok (ic |> slurp_channel |> of_string)

let cdb = Mapcdb.Cdb "app/var/cache/tags.cdb"

(** use a cdb as a backing map (store).
 *
 * Mapcdb cannot satisfy a Map.Make (String) yet
*)
let cdb_normalise short long tags (lut : Mapcdb.cdb) =
  let f_add (v : tag list) lut =
    let keep _ = true in
    let fkt_add_all (add1 : ((bytes*bytes) -> unit)) =
      let _added = v |>  List.fold_left (fun lut' item -> 
          let k = item |> fold |> Bytes.of_string in
          (match Mapcdb.find_opt k lut' with
           | None -> 
             let (Tag v) = item in
             add1 (k,v |> Bytes.of_string);
           | Some _ -> ());
          lut'
        ) lut in
      ()
    in
    Mapcdb.add_many keep fkt_add_all lut
  in
  let f_find s lut = 
    let s' = (Tag s) |> fold in
    Tag (match Mapcdb.find_string_opt s' lut with
        | None   -> s
        | Some s -> s)
  in
  normalise0 short long tags lut f_add f_find