1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # Author: Omar Vega Ramos
- # E-mail: ovruni@riseup.net
- # License: GNU GPL - GNU General Public License v3.0 or later
- # http://www.gnu.org/licenses/gpl.html
- from urllib.parse import quote_plus
- from urllib.request import urlopen
- #from urllib2 import urlopen
- import json
- from lxml import html
- # Get pages from category
def get_pages_by_category(category_name, limit=20, cmcontinue=None):
    """Return every member of a wiki category, following API pagination.

    Queries the ``list=categorymembers`` module of the MediaWiki API hosted
    at the module-level ``domain`` and keeps requesting pages of results
    until the response no longer contains a ``continue`` entry.

    Args:
        category_name: Category title without the ``Category:`` prefix.
            Spaces are converted to underscores.
        limit: Value sent as ``cmlimit`` (results per request).
        cmcontinue: Optional continuation token from a previous response;
            when given, listing resumes from that point.

    Returns:
        A list with the ``categorymembers`` entries of all fetched
        responses, in the order the API returned them.

    Raises:
        KeyError: If a response lacks ``query``/``categorymembers``.
        Whatever ``urlopen`` or ``json.loads`` raise on network or
        decoding failures is propagated unchanged.
    """
    # '%' is kept as a safe character so that names a caller already
    # percent-encoded are not encoded a second time.
    title = quote_plus('Category:' + category_name.replace(' ', '_'), safe=':%')
    base_url = domain + '/w/api.php?action=query&list=categorymembers&format=json'
    base_url = base_url + '&cmtitle=' + title
    base_url = base_url + '&cmlimit=' + str(limit)

    page_list = []
    # Iterate instead of recursing: one request per loop turn, so the number
    # of result pages is not bounded by the interpreter's recursion limit.
    while True:
        url = base_url
        if cmcontinue is not None:
            # Continuation tokens contain characters such as '|' that must
            # be escaped inside a query string.
            url = url + '&cmcontinue=' + quote_plus(cmcontinue, safe='%')

        # The context manager closes the connection even if reading or
        # JSON decoding fails.
        with urlopen(url) as response:
            data = json.loads(response.read())

        page_list.extend(data['query']['categorymembers'])

        # No 'continue' entry means this was the last batch.
        if 'continue' not in data:
            return page_list
        cmcontinue = data['continue']['cmcontinue']
# Base URL of the wiki whose API is queried and whose page links are printed.
domain = 'https://commons.wikimedia.org'

# Category whose members are checked.
category_monuments = 'Images from Wiki Loves Monuments 2017 in Peru'

# NOTE: despite the variable name, this is the category of pages that
# already HAVE a known ID; its members are excluded from the report below.
category_monuments_without_id = 'Cultural heritage monuments in Peru with known IDs'

# Get total pages from categories (the function converts spaces to
# underscores itself, so the raw names are passed).
pages = get_pages_by_category(category_monuments, 200)

# A set gives constant-time membership tests in the loop below.
pages_with_id = {
    page['title']
    for page in get_pages_by_category(category_monuments_without_id, 200)
}

# Print "title, url" for every page that is not in the known-ID category.
for page in pages:
    title = page['title']
    if title not in pages_with_id:
        # Build the link only for pages that are actually reported.
        url = domain + '/wiki/' + quote_plus(title.replace(' ', '_'))
        print(title + ", " + url)
|