topics.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. #!/usr/bin/env python3
  2. import requests
  3. import time
  4. from rdflib import BNode, Graph, Literal, Namespace, URIRef
  5. from rdflib.namespace import RDF
  6. from string import Template
  7. dokkg = Graph ()
  8. SCHEMA = Namespace ('http://schema.org/')
  9. dokkg.bind ('schema', SCHEMA)
  10. def query (query_string, limit=100000, offset=0):
  11. """Query the SPARQL endpoint."""
  12. return requests.post (
  13. 'http://localhost:3030/dokk/sparql',
  14. timeout = 60,
  15. data = {
  16. 'format': 'json',
  17. 'query': Template (query_string).substitute ({ 'limit': limit, 'offset': offset })
  18. }).json ()
  19. QUERY_MANPAGES = """
  20. PREFIX graph: <graphdata:name:>
  21. PREFIX manpage-terms: <urn:manpage:terms:>
  22. PREFIX manpage: <urn:manpage:>
  23. PREFIX schema: <http://schema.org/>
  24. SELECT ?name ?description
  25. FROM graph:manpages
  26. WHERE
  27. {
  28. [] a schema:TextDigitalDocument ;
  29. schema:identifier ?name ;
  30. schema:disambiguatingDescription ?description .
  31. }
  32. LIMIT $limit
  33. OFFSET $offset
  34. """
  35. # Some licenses have more than one name, so we select only one using MIN
  36. # which selects the string with capital letters, eg. "BSD 2-Clause" instead
  37. # of "BSD 2-clause"
  38. QUERY_SPDX = """
  39. PREFIX graph: <graphdata:name:>
  40. PREFIX schema: <http://schema.org/>
  41. PREFIX spdx: <http://spdx.org/rdf/terms#>
  42. SELECT ?id (MIN (?_name) AS ?name)
  43. FROM graph:spdx
  44. WHERE
  45. {
  46. ?license a spdx:License ;
  47. spdx:licenseId ?id ;
  48. spdx:name ?_name .
  49. FILTER NOT EXISTS { ?license spdx:isDeprecatedLicenseId [] }
  50. }
  51. GROUP BY ?id
  52. LIMIT $limit
  53. OFFSET $offset
  54. """
  55. limit = 50000
  56. offset = 0
  57. while True:
  58. data = query (QUERY_MANPAGES, limit=limit, offset=offset)
  59. if len (data['results']['bindings']) == 0:
  60. break
  61. data = data['results']['bindings']
  62. for item in data:
  63. thing = URIRef ('https://dokk.org/manpages/' + item['name']['value'])
  64. dokkg.add ((thing,
  65. RDF.type,
  66. SCHEMA.Thing))
  67. dokkg.add ((thing,
  68. SCHEMA.name,
  69. Literal (item['name']['value'])))
  70. dokkg.add ((thing,
  71. SCHEMA.description,
  72. Literal (item['description']['value'].capitalize ())))
  73. dokkg.add ((thing,
  74. SCHEMA.url,
  75. Literal ('/manpages/' + item['name']['value'])))
  76. offset += limit
  77. limit = 50000
  78. offset = 0
  79. while True:
  80. data = query (QUERY_SPDX, limit=limit, offset=offset)
  81. if len (data['results']['bindings']) == 0:
  82. break
  83. data = data['results']['bindings']
  84. for item in data:
  85. thing = URIRef ('https://dokk.org/licenses/' + item['id']['value'])
  86. dokkg.add ((thing,
  87. RDF.type,
  88. SCHEMA.Thing))
  89. dokkg.add ((thing,
  90. SCHEMA.name,
  91. Literal (item['name']['value'])))
  92. dokkg.add ((thing,
  93. SCHEMA.description,
  94. Literal (item['id']['value'])))
  95. dokkg.add ((thing,
  96. SCHEMA.url,
  97. Literal ('/licenses/' + item['id']['value'])))
  98. offset += limit
  99. # Write graph to file
  100. dokkg.serialize ('topics.ttl', 'turtle')