123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231 |
- #!/bin/sh
- # -*- scheme -*-
- exec guile -e main -s "$0" "$@"
- !#
- ;; Simple WoT crawler
- (use-modules (web request)
- (web client)
- (web response)
- (web uri)
- (web http)
- (ice-9 threads)
- (ice-9 vlist)
- (ice-9 rdelim)
- (ice-9 futures)
- (rnrs io ports)
- (ice-9 match)
- (srfi srfi-42)
- (srfi srfi-1)
- (rnrs bytevectors)
- (sxml simple)
- (sxml match))
- (define base-url "http://127.0.0.1:8888")
- (define seed-id "USK@QeTBVWTwBldfI-lrF~xf0nqFVDdQoSUghT~PvhyJ1NE,OjEywGD063La2H-IihD7iYtZm3rC0BP6UTvvwyF5Zh4,AQACAAE/WebOfTrust/1502")
- (define (furl uri)
- (string-append base-url uri "?forcedownload=true"))
- (define (furl-uri uri)
- (string-append base-url "/" uri "?forcedownload=true"))
- (define (get url)
- (let* ((u (string->uri url))
- (r (build-request u))
- (p (open-socket-for-uri u))
- (rr (write-request r p))
- (rp (request-port rr)))
- (force-output p)
- (declare-opaque-header! "Location")
- ;(while (write (read-line p))
- ; (newline))
- (let ((resp (read-response rp)))
- (let ((c (response-code resp))
- (h (response-headers resp))
- (b (read-response-body resp)))
- (cond
- ((= c 301)
- (get (furl (assoc-ref h 'location))))
- ((= c 200)
- (cond
- ((equal? '(text/html (charset . "utf-8")) (assoc-ref h 'content-type))
- (utf8->string b))
- ((equal? '(application/force-download) (assoc-ref h 'content-type))
- (utf8->string b))
- (else (assoc-ref h 'content-type))))
- (else c))))))
- (define (non-breaking-sxml-reader xml-port)
- (catch #t
- (lambda () (xml->sxml xml-port))
- (lambda (key . args) (format #t "~A: ~A" key args)(newline) '())))
- (define (snarf-wot-ids xml-port)
- (let ((sxml (non-breaking-sxml-reader xml-port)))
- (let ((uris '()))
- (let grab-uris ((sxml sxml))
- (match sxml
- (('Identity uri) (set! uris (cons uri uris)))
- ((a b ...)
- (map grab-uris sxml))
- (else '())))
- uris)))
- (define (wot-uri-key uri)
- (let ((index (string-index uri #\/)))
- (if index
- (string-take uri index)
- uri))) ;; no / in uri, so it is already a key.
- (define (wot-uri-filename uri)
- (let ((u (if (string-prefix? "freenet:" uri)
- (substring uri 8)
- uri)))
- (string-join (string-split u #\/) "-")))
- (define (dump-wot-id uri filename)
- (let ((u (if (string-prefix? "freenet:" uri)
- (substring uri 8)
- uri)))
- (format #t "Download to file ~A\n" filename)
- (if (string-prefix? "USK@" u)
- (let ((data (get (furl-uri u))))
- (if (string? data)
- (let ((port (open-output-file filename)))
- (put-string port data)
- (close-port port))
- (error (format #t "tried to save in file ~A\n" filename))))
- (error (format #t "tried to save in file ~A\n" filename)))))
- (define (flatten l)
- "Flatten a nested list into a single list."
- (cond ((null? l) '())
- ((list? l) (append (flatten (car l)) (flatten (cdr l))))
- (else (list l))))
- (define* (crawl-wot seed-id #:key (redownload #f))
- ;; TODO: add (flatten ...) with Guile 2.1.x (currently it gives a stack overflow)
- (let ((known '()))
- (let crawl ((seed seed-id))
- ;; save the data
- (if (catch 'misc-error
- (lambda () (let* ((filename (wot-uri-filename seed))
- (dump (lambda () (dump-wot-id seed filename))))
- (if (and (not redownload) (file-exists? filename))
- (let* ((s (stat filename))
- (size (stat:size s)))
- (if (= size 0)
- (dump)
- (format #t "Use local copy of file ~A (redownload ~A).\n" filename redownload)))
- (dump))
- #f))
- (lambda (key . args) #t))
- known
- ;; snarf all uris
- (let ((uris (call-with-input-file (wot-uri-filename seed) snarf-wot-ids)))
- ;; (write seed)(newline)
- ;; (when (not (null? uris))
- ;; (write (car uris))(newline))
- (let ((new (list-ec (: u uris) (if (and
- (not (pair? u)) ; TODO: this is a hack. I do not know why u can be the full sxml. Seems to happen with IDs who do not have any trust set.
- (not (member (wot-uri-key u) known)))) u)))
- (when (not (null? new))
- (display "new: ")
- (write (car new))(newline))
- (when (not (null? known))
- (display "known: ")
- (write (car known))(newline)(write (length known))(newline))
- (set! known (lset-union equal?
- (list-ec (: u new) (wot-uri-key u))
- known))
- (if (null? new)
- known
- (lset-union equal? known (par-map crawl new)))))))))
- (define (parse-datehint str)
- (let ((lines (string-split str #\newline)))
- `((version . ,(list-ref lines 1))
- (date . ,(list-ref lines 2)))))
- (define* (datehint-for-key key year #:key (sitename "WebOfTrust") (week #f))
- (string-append "SSK" (substring key 3)
- "/" sitename
- "-" "DATEHINT"
- "-" (number->string year)
- (if week (string-append "-WEEK-" (number->string week)) "")))
-
- (define (furl-key-name-version key name version)
- "Get a freenet URL for the key and the version"
- (furl-uri (string-append "SSK" (substring key 3) "/" name "-" version)))
- (define (download-by-weekly-date-hint uri year week)
- (let* ((weekuri (datehint-for-key (wot-uri-key uri) year #:week week))
- (hint (get (furl-uri weekuri))))
- (if (not (string? hint))
- #f
- (let* ((hint-alist (parse-datehint hint))
- (version (assoc-ref hint-alist 'version))
- (date (assoc-ref hint-alist 'date))
- (url (furl-key-name-version (wot-uri-key uri) "WebOfTrust" version))
- (filename (string-append date "/" (wot-uri-key uri) "-" version)))
- (when (not (file-exists? date))
- (mkdir date))
- (format #t "download to: ~A | for week ~A\n" filename week)
- (let ((data (get url)))
- (when (string? data)
- (let ((port (open-output-file filename)))
- (put-string port data)
- (close-port port))))
- filename))))
- (define (download-by-date-hint uri)
- "Download all versions of the ID, ordered by the week in the DATEHINT."
- ;; An uri looks like this: USK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE
- ;; A date hint for WoT looks like this: SSK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE-WebOfTrust-DATEHINT-2015
- ;; or
- ;; SSK@[key]/[sitename]-DATEHINT-[year]
- ;; SSK@[key]/[sitename]-DATEHINT-[year]-WEEK-[week]
- ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]
- ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]-[day]
- ;; see http://draketo.de/light/english/freenet/usk-and-date-hints
- ;; Approach: First check whether the ID has a date hint for each year. Then check each weak in the matching years.
- ;; download the versions into directories ordered as YEAR-month-day/SSK@...-WebOfTrust-version
- (let ((years (iota 10 2016 -1)) ; last 10 years
- (weeks (iota 52 1))) ; 1-52
- (delete #f ;; only return the filenames of successful downloads
- (par-map (lambda (year)
- (let* ((yearuri (datehint-for-key (wot-uri-key uri) year))
- (hint (get (furl-uri yearuri))))
- (if (not (string? hint))
- #f
- (let* ((hint-alist (parse-datehint hint))
- (date (assoc-ref hint-alist 'date))
- (month (string->number (list-ref (string-split date #\-) 1)))
- (min-week (* month 4))) ; avoid trying to download weeks which cannot be available.
- (format #t "Downloading key ~a starting in month ~a for weeks ~a to 52\n" yearuri month min-week)
- (delete #f ;; only return the filenames of successful downloads
- (n-par-map 10 (lambda (week)
- (if (< week min-week) ; avoid weeks earlier than the date in the yearly date hint
- #f
- (download-by-weekly-date-hint uri year week)))
- weeks))))))
- years))))
- (define (main args)
- (let ((seed-id (if (null? (cdr args))
- seed-id
- (car (cdr args)))))
- (let ((seed (if (string-index seed-id #\/)
- seed-id
- (string-append "USK" (string-drop seed-id 3) "/WebOfTrust/-1")))) ; -1 can also return 0
- ;; (crawl-wot seed))))
- ;; (write (download-by-date-hint seed)))))
- (par-map download-by-date-hint
- (crawl-wot seed)))))
|