(* br.ml *)

  (** [make_identifier src (t, tz_s)] builds the broadcast identifier
      ["<station>/YYYY/MM/DD/HHMM"].

      @param src source URL of the broadcast page. The station is recognised
        from the path found at offset 17, i.e. right after a prefix as long as
        ["https://www.br.de"] (presumably every [src] starts with exactly that
        prefix -- confirm against callers). An unknown path, or a [src] too
        short to hold any known path, yields the placeholder station [" ? "].
      @param t start time of the broadcast; [tz_s] is the offset in seconds
        handed to [Ptime.to_rfc3339] when rendering it.
      @raise Not_found if the rendered timestamp does not match the expected
        [YYYY-MM-DDTHH:MM] layout. *)
  let make_identifier src (t, tz_s) =
    (* Offset of the path inside [src]: String.length "https://www.br.de". *)
    let path_pos = 17 in
    (* Bounds-checked prefix test. A bare [String.sub src 17 len] raises
       [Invalid_argument] when [src] is shorter than [17 + len]; that used to
       abort the whole chain before shorter patterns or the fallback were
       tried. *)
    let path_is prefix =
      let len = String.length prefix in
      String.length src >= path_pos + len
      && String.equal prefix (String.sub src path_pos len)
    in
    (* Checked top to bottom, same order as the former if/else chain. *)
    let stations =
      [
        ("/radio/bayern1/", "b1");
        ("/radio/bayern2/", "b2");
        ("/radio/br-heimat/", "brheimat");
        ("/radio/br-schlager/", "b+");
        ("/radio/br24/", "b5");
        ("/puls/programm/puls-radio/", "puls");
      ]
    in
    let sta =
      match List.find_opt (fun (prefix, _) -> path_is prefix) stations with
      | Some (_, id) -> id
      | None -> " ? "
    in
    let rx =
      "([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2})" |> Re.Pcre.regexp
    in
    let m = t |> Ptime.to_rfc3339 ~tz_offset_s:tz_s |> Re.exec rx in
    let g = Re.Group.get m in
    (* Groups: 1 = year, 2 = month, 3 = day, 4 = hour, 5 = minute. *)
    sta ^ "/" ^ g 1 ^ "/" ^ g 2 ^ "/" ^ g 3 ^ "/" ^ g 4 ^ g 5
  open Soup

  (** [scrape_str str] parses [str] as HTML and assembles a [Broadcast.t] from
      the resulting document.

      The record is filled from these places in the page:
      - [source]: [value] attribute of [body input#fieldLink];
      - [subject]: [href] of [body a.media_broadcastSeries], prefixed with the
        site base when it starts with a slash, or [""] if that lookup fails;
      - [timestart] / [timeend]: the [livestreamBeginTime] and
        [livestreamEndTime] epoch values found in the [class] attribute of
        [body div.livestream_box], with their last three digits dropped
        (presumably milliseconds);
      - [title_series], [title_episode], [title]: trimmed leaf text of
        [.bcast_overline], [.bcast_subtitle] and [.bcast_headline], each [""]
        when unavailable;
      - [author], [image], [modified]: [content] of the matching [head > meta]
        elements; [description]: all texts below [.copytext];
      - [language]: [lang] attribute of [html].

      The lookups that are not wrapped in [try] are expected to raise when the
      element or attribute is missing (NOTE(review): confirm against the
      Lambda Soup documentation of [$] and [R]). *)
  let scrape_str str =
    let extract doc : Broadcast.t =
      let module Td = Timedesc in
      let zone = Td.Time_zone.make_exn "Europe/Zurich" in
      let site = "https://www.br.de" in
      (* Anything that does not parse with an explicit offset becomes
         [(Ptime.min, 0)]. *)
      let parse_modified (txt : string) : Broadcast.timestamp =
        match Ptime.of_rfc3339 txt with
        | Ok (t, Some tz_s, _) -> (t, tz_s)
        | Ok (_, None, _) | Error _ -> (Ptime.min, 0)
      in
      let sched_rx =
        Re.Pcre.regexp
          "livestreamBeginTime:'([0-9]+)[0-9]{3}', \
           livestreamEndTime:'([0-9]+)[0-9]{3}'"
      in
      let source = R.attribute "value" (doc $ "body input#fieldLink") in
      let subject =
        try
          let href = R.attribute "href" (doc $ "body a.media_broadcastSeries") in
          (* starts_with requires a very recent ocaml, so we do without *)
          let prefix = if String.rcontains_from href 0 '/' then site else "" in
          prefix ^ href
        with _ -> ""
      in
      (* Extract start and end time from the class attribute text [cls]. *)
      let sched (cls : string) : Broadcast.timestamp * Broadcast.timestamp =
        let groups = Re.exec sched_rx cls in
        let of_epoch idx =
          let secs = Float.of_string (Re.Group.get groups idx) in
          (* thanks https://discuss.ocaml.org/t/adding-timezone-to-utc-epoch-seconds/8565/7?u=mro *)
          let time = Td.of_timestamp_float_s_exn ~tz_of_date_time:zone secs in
          let offset =
            match Td.offset_from_utc time with
            | `Single span -> Int64.to_int (Td.Span.get_s span)
            | `Ambiguous _ ->
                failwith
                  ("Unexpected case getting timezone offset with utc epoch "
                 ^ cls)
          in
          let ptime =
            Option.get
              (Td.Utils.ptime_of_timestamp (Td.to_timestamp_single time))
          in
          (ptime, offset)
        in
        (of_epoch 1, of_epoch 2)
      in
      let timestart, timeend =
        sched (R.attribute "class" (doc $ "body div.livestream_box"))
      in
      (* Detach the node selected by [sel] below body and return its trimmed
         leaf text, or "" on any failure. The node is deleted from the
         document, presumably so that it does not show up in the description
         gathered afterwards. *)
      let take_text sel =
        try
          let node = doc $ "body " ^ sel in
          delete node;
          String.trim (R.leaf_text node)
        with _ -> ""
      in
      (* Order kept as in the original bindings: each call mutates [doc]. *)
      let title_series = take_text ".bcast_overline" in
      let title_episode = take_text ".bcast_subtitle" in
      let title = take_text ".bcast_headline" in
      (* Join every text fragment of the selected nodes with single spaces. *)
      let make_desc nodes =
        nodes |> to_list |> List.map texts |> List.flatten
        |> String.concat " " |> String.trim
      in
      let meta key = R.attribute "content" (doc $ "head > meta[" ^ key ^ "]") in
      {
        author = meta "name=DCTERMS.creator";
        description = make_desc (doc $$ "html > body .copytext");
        identifier = make_identifier source timestart;
        image = meta "property=og:image";
        language = R.attribute "lang" (doc $ "html");
        modified = parse_modified (meta "property=og:article:modified_time");
        source;
        subject;
        timeend;
        timestart;
        title;
        title_episode;
        title_series;
      }
    in
    extract (parse str)
  (** [scrape cin] reads the content of [cin] with [read_channel] and turns it
      into a [Broadcast.t] via [scrape_str]. Whatever those two raise is
      passed on unchanged. *)
  let scrape cin : Broadcast.t = scrape_str (read_channel cin)