  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # Author: Omar Vega Ramos
  4. # E-mail: ovruni@riseup.net
  5. # License: GNU GPL - GNU General Public License v3.0 or later
  6. # http://www.gnu.org/licenses/gpl.html
  7. import re
  8. import requests
  9. from lxml import html
  10. # Get subcategories from category
  11. def get_subcategories(category_url):
  12. page = requests.get(category_url)
  13. tree = html.fromstring(page.content)
  14. subcategories_container = tree.find('.//div[@id="mw-subcategories"]')
  15. subcategory_list = []
  16. # Check if the category has subcategories
  17. if subcategories_container == None:
  18. return subcategory_list
  19. subcategory_objects = subcategories_container.findall('.//li/div/div/a')
  20. for subcategory_object in subcategory_objects:
  21. subcategory_list.append(subcategory_object.text)
  22. new_subcategory_list = subcategory_list + get_subcategories(domain + subcategory_object.get('href'))
  23. return new_subcategory_list
  24. # Get pages from category
  25. def get_pages(category_url):
  26. page = requests.get(category_url)
  27. tree = html.fromstring(page.content)
  28. pages_container = tree.find('.//div[@id="mw-pages"]')
  29. page_list = []
  30. # Check if the category has pages
  31. if pages_container == None:
  32. return page_list
  33. page_objects = pages_container.findall('.//li/a')
  34. for page_object in page_objects:
  35. page_list.append(page_object.text)
  36. next_link = pages_container.find('./a[2]')
  37. if next_link != None:
  38. if next_link.text == 'página siguiente':
  39. new_page_list = page_list + get_pages(domain + next_link.get('href'))
  40. return new_page_list
  41. else:
  42. return page_list
  43. else:
  44. return page_list
  45. domain = 'https://es.wikipedia.org'
  46. category_name = 'Wikipedia:Wikipedistas de Perú'
  47. users = []
  48. # Get total pages from category
  49. categories = get_subcategories(domain + '/wiki/Categoría:' + category_name.replace(' ', '_'))
  50. categories.append(category_name)
  51. for category in categories:
  52. pages_by_category = get_pages(domain + '/wiki/Categoría:' + category.replace(' ', '_'))
  53. # Get user pages
  54. for page in pages_by_category:
  55. # Get only user page like 'Usuario:XXXX'
  56. search_user = re.search(r'^Usuari[a|o]:([^/]+)', page, re.M|re.I)
  57. if search_user != None:
  58. user = search_user.group(0)
  59. # Skip duplicate users and userboxes
  60. if user not in users and user.find(':Userbox') != 7:
  61. users.append(user)
  62. print(user)