get_monuments_without_ids.py
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # Author: Omar Vega Ramos
  4. # E-mail: ovruni@riseup.net
  5. # License: GNU GPL - GNU General Public License v3.0 or later
  6. # http://www.gnu.org/licenses/gpl.html
  7. from urllib.parse import quote_plus
  8. from urllib.request import urlopen
  9. #from urllib2 import urlopen
  10. import json
  11. from lxml import html
  12. # Get pages from category
  13. def get_pages_by_category(category_name, limit=20, cmcontinue=None):
  14. url = domain + '/w/api.php?action=query&list=categorymembers&format=json'
  15. url = url + '&cmtitle=Category:' + category_name.replace(' ', '_')
  16. url = url + '&cmlimit=' + str(limit)
  17. # Check if cmcontinue exists
  18. if cmcontinue != None:
  19. url = url + '&cmcontinue=' + cmcontinue
  20. # Get monuments list
  21. response = urlopen(url)
  22. data = json.loads(response.read())
  23. response.close()
  24. page_list = data['query']['categorymembers']
  25. # Check if exists more pages
  26. if 'continue' in data:
  27. cmcontinue = data['continue']['cmcontinue']
  28. new_page_list = page_list + get_pages_by_category(category_name, limit, cmcontinue)
  29. return new_page_list
  30. else:
  31. return page_list
  32. domain = 'https://commons.wikimedia.org'
  33. category_monuments = 'Images from Wiki Loves Monuments 2017 in Peru'
  34. category_monuments_without_id = 'Cultural heritage monuments in Peru with known IDs'
  35. # Get total pages from categories
  36. pages = get_pages_by_category(category_monuments.replace(' ', '_'), 200)
  37. pages_with_id = []
  38. for page in get_pages_by_category(category_monuments_without_id.replace(' ', '_'), 200):
  39. pages_with_id.append(page['title'])
  40. # Get monuments without IDs
  41. for page in pages:
  42. url = domain + '/wiki/' + quote_plus(page['title'].replace(' ', '_'))
  43. if page['title'] not in pages_with_id:
  44. print(page['title'] + ", " + url)