123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- #!/usr/bin/env python3
- import requests
- import time
- from rdflib import BNode, Graph, Literal, Namespace, URIRef
- from rdflib.namespace import RDF
- from string import Template
# Vocabulary used for every triple produced by this script
SCHEMA = Namespace ('http://schema.org/')

# Graph that collects all the topics; it is written to disk at the end of
# the script. The "schema" prefix is registered on it for the schema.org
# namespace.
dokkg = Graph ()
dokkg.bind ('schema', SCHEMA)
def query (query_string, limit=100000, offset=0):
    """Run one page of a query against the local SPARQL endpoint.

    query_string is treated as a string.Template: its $limit and $offset
    placeholders are replaced with the given values before the query is
    POSTed to the endpoint.

    Returns the decoded JSON body of the response.

    Raises requests.HTTPError if the endpoint replies with a 4xx/5xx status,
    KeyError if query_string contains a placeholder other than $limit and
    $offset, and whatever requests raises on connection failure or timeout.
    """

    response = requests.post (
        'http://localhost:3030/dokk/sparql',
        timeout = 60,
        data = {
            'format': 'json',
            'query': Template (query_string).substitute ({ 'limit': limit, 'offset': offset })
        })

    # Fail here, with the HTTP status in the message, instead of letting an
    # error page surface later as a JSON decoding error or a missing key.
    response.raise_for_status ()

    return response.json ()
# Select the identifier and the short description of every
# schema:TextDigitalDocument in the manpages graph.
# $limit and $offset are template placeholders, replaced by query().
# The results are fetched one page at a time, so the query needs an ORDER BY:
# without it the endpoint is free to return rows in a different order for
# every request, and pages could overlap or skip rows.
QUERY_MANPAGES = """
    PREFIX graph: <graphdata:name:>
    PREFIX manpage-terms: <urn:manpage:terms:>
    PREFIX manpage: <urn:manpage:>
    PREFIX schema: <http://schema.org/>

    SELECT ?name ?description
    FROM graph:manpages
    WHERE
    {
        [] a schema:TextDigitalDocument ;
           schema:identifier ?name ;
           schema:disambiguatingDescription ?description .
    }
    ORDER BY ?name ?description
    LIMIT $limit
    OFFSET $offset
"""
# Select the id and the name of every SPDX license that is not deprecated.
# Some licenses have more than one name, so we keep only one per id using MIN,
# which selects the string with capital letters, eg. "BSD 2-Clause" instead
# of "BSD 2-clause".
# $limit and $offset are template placeholders, replaced by query().
# The results are fetched one page at a time, so the query needs an ORDER BY:
# without it the endpoint is free to return rows in a different order for
# every request, and pages could overlap or skip rows.
QUERY_SPDX = """
    PREFIX graph: <graphdata:name:>
    PREFIX schema: <http://schema.org/>
    PREFIX spdx: <http://spdx.org/rdf/terms#>

    SELECT ?id (MIN (?_name) AS ?name)
    FROM graph:spdx
    WHERE
    {
        ?license a spdx:License ;
                 spdx:licenseId ?id ;
                 spdx:name ?_name .

        FILTER NOT EXISTS { ?license spdx:isDeprecatedLicenseId [] }
    }
    GROUP BY ?id
    ORDER BY ?id
    LIMIT $limit
    OFFSET $offset
"""
def _capitalize_first (text):
    """Return text with its first character upper-cased.

    str.capitalize () is deliberately not used: it also lower-cases the rest
    of the string, which mangles acronyms and proper names in descriptions.
    """

    return text[:1].upper () + text[1:]


def _iter_bindings (query_string, limit=50000):
    """Yield every binding returned by query_string.

    Results are fetched with query (), one page of limit rows at a time,
    until the endpoint returns an empty page.
    """

    offset = 0

    while True:
        bindings = query (query_string, limit=limit, offset=offset)['results']['bindings']

        if not bindings:
            return

        yield from bindings

        offset += limit


def _add_topic (path, name, description):
    """Add a schema:Thing to dokkg.

    path is the site-relative location of the topic (eg. "/manpages/ls.1").
    It is appended to https://dokk.org to build the node IRI, and stored
    as-is as the schema:url literal.
    """

    thing = URIRef ('https://dokk.org' + path)

    dokkg.add ((thing, RDF.type, SCHEMA.Thing))
    dokkg.add ((thing, SCHEMA.name, Literal (name)))
    dokkg.add ((thing, SCHEMA.description, Literal (description)))
    dokkg.add ((thing, SCHEMA.url, Literal (path)))


# Manpages: the identifier is the topic name
for item in _iter_bindings (QUERY_MANPAGES):
    manpage_name = item['name']['value']

    _add_topic ('/manpages/' + manpage_name,
                manpage_name,
                _capitalize_first (item['description']['value']))

# SPDX licenses: the license id is used as the description
for item in _iter_bindings (QUERY_SPDX):
    license_id = item['id']['value']

    _add_topic ('/licenses/' + license_id,
                item['name']['value'],
                license_id)
# Save the collected topics to topics.ttl in Turtle format
dokkg.serialize (destination='topics.ttl', format='turtle')
|