12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # Author: Omar Vega Ramos
- # E-mail: ovruni@riseup.net
- # License: GNU GPL - GNU General Public License v3.0 or later
- # http://www.gnu.org/licenses/gpl.html
- import re
- import requests
- from lxml import html
# Get subcategories from category
def get_subcategories(category_url):
    """Recursively collect subcategory names of a MediaWiki category page.

    Fetches *category_url*, scrapes the ``mw-subcategories`` block and
    descends into every subcategory link, returning a flat list of
    subcategory names.  Absolute URLs for the recursion are built from
    the module-level ``domain`` constant.

    :param category_url: full URL of the category page to scrape
    :return: list of subcategory name strings (possibly empty)
    """
    page = requests.get(category_url)
    tree = html.fromstring(page.content)
    container = tree.find('.//div[@id="mw-subcategories"]')
    # A category with no subcategories has no container element at all.
    if container is None:
        return []
    subcategory_list = []
    for anchor in container.findall('.//li/div/div/a'):
        subcategory_list.append(anchor.text)
        # BUG FIX: the original rebuilt `new_subcategory_list` on every
        # iteration, keeping only the LAST subcategory's recursive
        # descendants (and raised NameError when the loop body never ran).
        # Extending in place keeps every branch of the category tree.
        subcategory_list.extend(get_subcategories(domain + anchor.get('href')))
    return subcategory_list
# Get pages from category
def get_pages(category_url):
    """Collect the page titles listed in a MediaWiki category page.

    Scrapes the ``mw-pages`` block of *category_url* and follows the
    Spanish-language "página siguiente" (next page) pagination link
    recursively, so multi-page categories are fully enumerated.

    :param category_url: full URL of the category page to scrape
    :return: list of page title strings (possibly empty)
    """
    page = requests.get(category_url)
    tree = html.fromstring(page.content)
    container = tree.find('.//div[@id="mw-pages"]')
    # A category with no member pages has no container element at all.
    if container is None:
        return []
    page_list = [anchor.text for anchor in container.findall('.//li/a')]
    # The second direct <a> child of the container is the pagination
    # link; its text distinguishes "next" from "previous".
    next_link = container.find('./a[2]')
    if next_link is not None and next_link.text == 'página siguiente':
        page_list += get_pages(domain + next_link.get('href'))
    return page_list
domain = 'https://es.wikipedia.org'
category_name = 'Wikipedia:Wikipedistas de Perú'
users = []

# Gather the category itself plus all of its subcategories.
categories = get_subcategories(domain + '/wiki/Categoría:' + category_name.replace(' ', '_'))
categories.append(category_name)

# Match top-level user pages like 'Usuario:XXXX' / 'Usuaria:XXXX' but not
# subpages ('Usuario:XXXX/...').
# BUG FIX: the original class [a|o] also matched a literal '|' character;
# inside a character class '|' is not an alternation operator.
# Compiled once here instead of per page (hot loop).
user_page_re = re.compile(r'^Usuari[ao]:([^/]+)', re.M | re.I)

for category in categories:
    pages_by_category = get_pages(domain + '/wiki/Categoría:' + category.replace(' ', '_'))
    # Get user pages
    for page in pages_by_category:
        match = user_page_re.search(page)
        if match is None:
            continue
        user = match.group(0)
        # Skip duplicate users and userbox template pages: ':Userbox' at
        # index 7 means the title is 'Usuario:Userbox...' / 'Usuaria:Userbox...'.
        if user not in users and user.find(':Userbox') != 7:
            users.append(user)
            print(user)
|