get_monuments_with_ids.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # Author: Omar Vega Ramos
  4. # E-mail: ovruni@riseup.net
  5. # License: GNU GPL - GNU General Public License v3.0 or later
  6. # http://www.gnu.org/licenses/gpl.html
  7. from urllib.parse import quote_plus
  8. from urllib.request import urlopen
  9. #from urllib2 import urlopen
  10. import json
  11. from lxml import html
  12. # Get pages from category
  13. def get_pages_by_category(category_name, limit=20, cmcontinue=None):
  14. url = domain + '/w/api.php?action=query&list=categorymembers&format=json'
  15. url = url + '&cmtitle=Category:' + category_name.replace(' ', '_')
  16. url = url + '&cmlimit=' + str(limit)
  17. # Check if cmcontinue exists
  18. if cmcontinue != None:
  19. url = url + '&cmcontinue=' + cmcontinue
  20. # Get monuments list
  21. response = urlopen(url)
  22. data = json.loads(response.read())
  23. response.close()
  24. page_list = data['query']['categorymembers']
  25. # Check if exists more pages
  26. if 'continue' in data:
  27. cmcontinue = data['continue']['cmcontinue']
  28. new_page_list = page_list + get_pages_by_category(category_name, limit, cmcontinue)
  29. return new_page_list
  30. else:
  31. return page_list
  32. domain = 'https://commons.wikimedia.org'
  33. category_name = 'Cultural heritage monuments in Peru with known IDs'
  34. # Get total pages from category
  35. pages_by_category = get_pages_by_category(category_name.replace(' ', '_'), 200)
  36. # Get monuments IDs
  37. for page in pages_by_category:
  38. #title = page['title'].replace(' ', '_')
  39. #url = domain + '/wiki/' + quote_plus(title)
  40. url = domain + '/wiki/' + quote_plus(page['title'].replace(' ', '_'))
  41. response = urlopen(url)
  42. content = response.read()
  43. response.close()
  44. tree = html.fromstring(content)
  45. # Get monument ID
  46. monument_id = tree.find('.//td[@class="description"]/.//a').text
  47. print('"' + monument_id + '", ' + url)