12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- let make_identifier src (t, tz_s) =
- let sta =
- if "/radio/bayern1/" = String.sub src 17 15 then "b1"
- else if "/radio/bayern2/" = String.sub src 17 15 then "b2"
- else if "/radio/br-heimat/" = String.sub src 17 17 then "brheimat"
- else if "/radio/br-schlager/" = String.sub src 17 19 then "b+"
- else if "/radio/br24/" = String.sub src 17 12 then "b5"
- else if "/puls/programm/puls-radio/" = String.sub src 17 26 then "puls"
- else " ? "
- and rx =
- "([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2})" |> Re.Pcre.regexp
- in
- let m = t |> Ptime.to_rfc3339 ~tz_offset_s:tz_s |> Re.exec rx in
- let g = Re.Group.get m in
- sta ^ "/" ^ g 1 ^ "/" ^ g 2 ^ "/" ^ g 3 ^ "/" ^ g 4 ^ g 5
- open Soup
- let scrape_str str =
- let scrape_soup soup : Broadcast.t =
- let module Td = Timedesc in
- let tz = "Europe/Zurich" |> Td.Time_zone.make_exn
- and base = "https://www.br.de"
- and of_rfc3339 (str : string) : Broadcast.timestamp =
- match str |> Ptime.of_rfc3339 with
- | Ok (t, Some tz_s, _) -> (t, tz_s)
- | _ -> (Ptime.min, 0)
- and rx =
- "livestreamBeginTime:'([0-9]+)[0-9]{3}', \
- livestreamEndTime:'([0-9]+)[0-9]{3}'" |> Re.Pcre.regexp
- in
- let source = soup $ "body input#fieldLink" |> R.attribute "value"
- and subject =
- try
- let url = soup $ "body a.media_broadcastSeries" |> R.attribute "href" in
- (* starts_with requires a very recent ocaml, so we do without *)
- (if String.rcontains_from url 0 '/' then base else "") ^ url
- with _ -> ""
- and sched (str : string) : Broadcast.timestamp * Broadcast.timestamp =
- (* extract timestart and timeend *)
- let m = str |> Re.exec rx in
- let of_epoch idx =
- let s = idx |> Re.Group.get m |> Float.of_string in
- (* thanks https://discuss.ocaml.org/t/adding-timezone-to-utc-epoch-seconds/8565/7?u=mro *)
- let time = s |> Td.of_timestamp_float_s_exn ~tz_of_date_time:tz in
- let offset =
- match Td.offset_from_utc time with
- | `Single x -> x |> Td.Span.get_s |> Int64.to_int
- | `Ambiguous _ ->
- failwith
- ("Unexpected case getting timezone offset with utc epoch " ^ str)
- in
- let ptime =
- time |> Td.to_timestamp_single |> Td.Utils.ptime_of_timestamp
- |> Option.get
- in
- (ptime, offset)
- in
- (of_epoch 1, of_epoch 2)
- in
- let timestart, timeend =
- soup $ "body div.livestream_box" |> R.attribute "class" |> sched
- and fkt s =
- try
- let n = soup $ "body " ^ s in
- delete n;
- n |> R.leaf_text |> String.trim
- with _ -> ""
- in
- let title_series = fkt ".bcast_overline"
- and title_episode = fkt ".bcast_subtitle"
- and title = fkt ".bcast_headline"
- and make_desc n =
- n |> to_list
- |> List.map (fun a -> a |> texts)
- |> List.flatten |> String.concat " " |> String.trim
- and meta k = soup $ "head > meta[" ^ k ^ "]" |> R.attribute "content" in
- {
- author = meta "name=DCTERMS.creator";
- description = soup $$ "html > body .copytext" |> make_desc;
- identifier = make_identifier source timestart;
- image = meta "property=og:image";
- language = soup $ "html" |> R.attribute "lang";
- modified = meta "property=og:article:modified_time" |> of_rfc3339;
- source;
- subject;
- timeend;
- timestart;
- title;
- title_episode;
- title_series;
- }
- in
- str |> parse |> scrape_soup
- let scrape cin : Broadcast.t = cin |> read_channel |> scrape_str
|