123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # Author: Omar Vega Ramos
- # E-mail: ovruni@riseup.net
- # License: GNU GPL - GNU General Public License v3.0 or later
- # http://www.gnu.org/licenses/gpl.html
- from urllib.parse import quote_plus
- from urllib.request import urlopen
- #from urllib2 import urlopen
- import json
- from lxml import html
# Get pages from category
def get_pages_by_category(category_name, limit=20, cmcontinue=None):
    """Return every member page of a Commons category via the MediaWiki API.

    Follows 'cmcontinue' pagination iteratively (the original recursed per
    page of results) until the category is exhausted. Relies on the
    module-level ``domain`` global for the wiki base URL.

    Args:
        category_name: Category title; spaces are converted to underscores.
        limit: Results per API request (MediaWiki caps anonymous clients).
        cmcontinue: Continuation token to resume from, or None to start.

    Returns:
        List of category-member dicts as returned by the API
        (each typically has 'pageid', 'ns' and 'title').
    """
    base_url = (
        domain + '/w/api.php?action=query&list=categorymembers&format=json'
        + '&cmtitle=Category:' + category_name.replace(' ', '_')
        + '&cmlimit=' + str(limit)
    )
    page_list = []
    while True:
        url = base_url
        if cmcontinue is not None:
            url = url + '&cmcontinue=' + cmcontinue
        # 'with' guarantees the HTTP response is closed even if
        # json.loads raises (the original closed it manually).
        with urlopen(url) as response:
            data = json.loads(response.read())
        page_list += data['query']['categorymembers']
        # The API includes 'continue' only when more results remain.
        if 'continue' not in data:
            return page_list
        cmcontinue = data['continue']['cmcontinue']
# Wiki to query and the category whose members we want to enumerate.
domain = 'https://commons.wikimedia.org'
category_name = 'Cultural heritage monuments in Peru with known IDs'

# Fetch the complete member list, 200 entries per API request.
pages_by_category = get_pages_by_category('_'.join(category_name.split(' ')), 200)
# Get monuments IDs: print one '"<id>", <url>' line per category page.
for page in pages_by_category:
    url = domain + '/wiki/' + quote_plus(page['title'].replace(' ', '_'))
    # 'with' closes the response even if reading/parsing fails
    # (the original closed it manually).
    with urlopen(url) as response:
        content = response.read()
    tree = html.fromstring(content)
    # The monument ID is presumably the text of the first link inside the
    # infobox description cell — confirm against the template markup.
    id_link = tree.find('.//td[@class="description"]/.//a')
    if id_link is None or id_link.text is None:
        # Page without the expected markup: skip instead of crashing
        # with an AttributeError mid-run (best-effort scrape).
        continue
    print('"' + id_link.text + '", ' + url)
|