unheck-html.rkt 4.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. #lang racket
  2. (provide unheck-html unheck-all-html)
  3. (require (prefix-in h: html)
  4. (prefix-in x: xml))
  5. ;; copied from somewhere tbh
  6. (define alpha-codes [list "apos" "nbsp" "iexcl" "cent" "pound" "curren" "yen" "brvbar" "sect" "uml" "copy" "ordf" "laquo" "not" "shy" "reg" "macr" "deg" "plusmn" "sup2" "sup3" "acute" "micro" "para" "middot" "cedil" "sup1" "ordm" "raquo" "frac14" "frac12" "frac34" "iquest" "Agrave" "Aacute" "Acirc" "Atilde" "Auml" "Aring" "Aelig" "Ccedil" "Egrave" "Eacute" "Ecirc" "Euml" "Igrave" "Iacute" "Icirc" "Iuml" "ETH" "Ntilde" "Ograve" "Oacute" "Ocirc" "Otilde" "Ouml" "times" "Oslash" "Ugrave" "Uacute" "Ucirc" "Uuml" "Yacute" "THORN" "szlig" "agrave" "aacute" "acirc" "atilde" "auml" "aring" "aelig" "ccedil" "egrave" "eacute" "ecirc" "euml" "igrave" "iacute" "icirc" "iuml" "eth" "ntilde" "ograve" "oacute" "ocirc" "otilde" "ouml" "divide" "oslash" "ugrave" "uacute" "ucirc" "uuml" "yacute" "thorn" "yuml" "quot" "amp" "lt" "gt" "OElig" "oelig" "Scaron" "scaron" "Yuml" "circ" "tilde" "ensp" "emsp" "thinsp" "zwnj" "zwj" "lrm" "rlm" "ndash" "mdash" "lsquo" "rsquo" "sbquo" "ldquo" "rdquo" "bdquo" "dagger" "Dagger" "permil" "lsaquo" "rsaquo" "euro" "fnof" "Alpha" "Beta" "Gamma" "Delta" "Epsilon" "Zeta" "Eta" "Theta" "Iota" "Kappa" "Lambda" "Mu" "Nu" "Xi" "Omicron" "Pi" "Rho" "Sigma" "Tau" "Upsilon" "Phi" "Chi" "Psi" "Omega" "alpha" "beta" "gamma" "delta" "epsilon" "zeta" "eta" "theta" "iota" "kappa" "lambda" "mu" "nu" "xi" "omicron" "pi" "rho" "sigmaf" "sigma" "tau" "upsilon" "phi" "chi" "psi" "omega" "thetasym" "upsih" "piv" "bull" "hellip" "prime" "Prime" "oline" "frasl" "weierp" "image" "real" "trade" "alefsym" "larr" "uarr" "rarr" "darr" "harr" "crarr" "lArr" "uArr" "rArr" "dArr" "hArr" "forall" "part" "exist" "empty" "nabla" "isin" "notin" "ni" "prod" "sum" "minus" "lowast" "radic" "prop" "infin" "ang" "and" "or" "cap" "cup" "int" "there4" "sim" "cong" "asymp" "ne" "equiv" "le" "ge" "sub" "sup" "nsub" "sube" "supe" "oplus" "otimes" "perp" "sdot" "lceil" "rceil" "lfloor" "rfloor" "lang" "rang" "loz" "spades" "clubs" "hearts" "diams"])
  7. (define num-codes [list 39 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 34 38 60 62 338 339 352 353 376 710 732 8194 8195 8201 8204 8205 8206 8207 8211 8212 8216 8217 8218 8220 8221 8222 8224 8225 8240 8249 8250 8364 402 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 931 932 933 934 935 936 937 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 977 978 982 8226 8230 8242 8243 8254 8260 8472 8465 8476 8482 8501 8592 8593 8594 8595 8596 8629 8656 8657 8658 8659 8660 8704 8706 8707 8709 8711 8712 8713 8715 8719 8721 8722 8727 8730 8733 8734 8736 8743 8744 8745 8746 8747 8756 8764 8773 8776 8800 8801 8804 8805 8834 8835 8836 8838 8839 8853 8855 8869 8901 8968 8969 8970 8971 9001 9002 9674 9824 9827 9829 9830])
  8. ;; TODO make a macro so it's computed at compile time
  9. (define alpha-codes-hash
  10. (for/hash ([i num-codes]
  11. [a alpha-codes])
  12. (values a (integer->char i))))
  13. (define (unheck-html str)
  14. (define (unheck _ mtch)
  15. (cond
  16. [(equal? (string-ref mtch 0) #\#)
  17. (let* ([num (substring mtch 1)]
  18. [num (string->number num)])
  19. (if (integer? num)
  20. (make-string 1 (integer->char num))
  21. (string-append "&#" mtch)))
  22. ]
  23. [else
  24. (let ([char (hash-ref alpha-codes-hash mtch #f)])
  25. (if char
  26. (make-string 1 char)
  27. (string-append "&" mtch)))]))
  28. (regexp-replace* #px"&(#?[\\w\\d]+);?" str unheck))
  29. ;; TODO: use this instead of unheck-html
  30. (define (unheck-all-html str)
  31. (let* ([html (h:read-html (open-input-string str))]
  32. [contents (extract-pcdata html)])
  33. (string-join contents " ")))
  34. ;; copied from the html-lib example
  35. ; extract-pcdata: html-content/c -> (listof string)
  36. ; Pulls out the pcdata strings from some-content.
  37. (define (extract-pcdata some-content)
  38. (cond [(x:pcdata? some-content)
  39. (list (x:pcdata-string some-content))]
  40. [(x:entity? some-content)
  41. (list)]
  42. [else
  43. (extract-pcdata-from-element some-content)]))
  44. ; extract-pcdata-from-element: html-element -> (listof string)
  45. ; Pulls out the pcdata strings from an-html-element.
  46. (define (extract-pcdata-from-element an-html-element)
  47. (match an-html-element
  48. [(struct h:html-full (attributes content))
  49. (apply append (map extract-pcdata content))]
  50. [(struct h:html-element (attributes))
  51. '()]))