get_monuments_without_ids.py
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # Author: Omar Vega Ramos
  4. # E-mail: ovruni@riseup.net
  5. # License: GNU GPL - GNU General Public License v3.0 or later
  6. # http://www.gnu.org/licenses/gpl.html
  7. from urllib.parse import quote_plus
  8. from urllib.request import urlopen
  9. #from urllib2 import urlopen
  10. import json
  11. from lxml import html
  12. # Get pages from category
  13. def get_pages_by_category(category_name, limit=20, cmcontinue=None):
  14. url = domain + '/w/api.php?action=query&list=categorymembers&format=json'
  15. url = url + '&cmtitle=Category:' + category_name.replace(' ', '_')
  16. url = url + '&cmlimit=' + str(limit)
  17. # Check if cmcontinue exists
  18. if cmcontinue != None:
  19. url = url + '&cmcontinue=' + cmcontinue
  20. # Get monuments list
  21. response = urlopen(url)
  22. data = json.loads(response.read())
  23. response.close()
  24. page_list = data['query']['categorymembers']
  25. # Check if exists more pages
  26. if 'continue' in data:
  27. cmcontinue = data['continue']['cmcontinue']
  28. new_page_list = page_list + get_pages_by_category(category_name, limit, cmcontinue)
  29. return new_page_list
  30. else:
  31. return page_list
  32. domain = 'https://commons.wikimedia.org'
  33. category_monuments = 'Images from Wiki Loves Monuments 2017 in Peru'
  34. category_monuments_without_id = 'Cultural heritage monuments in Peru with known IDs'
  35. # Get total pages from categories
  36. pages = get_pages_by_category(category_monuments.replace(' ', '_'), 200)
  37. pages_with_id = []
  38. for page in get_pages_by_category(category_monuments_without_id.replace(' ', '_'), 200):
  39. pages_with_id.append(page['title'])
  40. # Get monuments without IDs
  41. for page in pages:
  42. url = domain + '/wiki/' + quote_plus(page['title'].replace(' ', '_'))
  43. if page['title'] not in pages_with_id:
  44. print(page['title'] + ", " + url)