main.py

import requests
from bs4 import BeautifulSoup
import os
import csv
import time


def write_csv(data):  # append one row of scraped data to the CSV file
    path_f = os.path.dirname(os.path.abspath(__file__))
    # 'a' appends to the file instead of overwriting it
    with open(os.path.join(path_f, "testimonials.csv"), 'a', newline='', encoding='utf-8') as file_csv:
        writer_file = csv.writer(file_csv)
        writer_file.writerow([
            data['name'],
            data['url'],
            data['rating']
        ])


def get_html(url):  # fetch the HTML document at the given URL
    res = requests.get(url)
    return res.text


def get_normalise_str(string):  # normalize the rating string, e.g. "1,234 ratings" -> "1234"
    rating = string.split(' ')[0]
    result = rating.replace(',', '')
    return result


def get_data(html):  # extract the data from the HTML
    soup = BeautifulSoup(html, 'lxml')  # takes the HTML and the parser name
    sections = soup.find_all('section')
    for section in sections:  # iterate over the sections
        articles = section.find_all('article')
        for item in articles:  # pull the final data out of each plugin card
            header = item.find('h3').text
            link = item.find('a').get('href')
            rating = item.find('span', class_='rating-count').find('a').text
            result = get_normalise_str(rating)
            data = {'name': header, 'url': link, 'rating': result}
            write_csv(data)
            # print(data)


def main():
    url = 'https://wordpress.org/plugins/'
    html = get_html(url)
    get_data(html)


if __name__ == '__main__':
    start_time = time.time()
    main()
    print("--- %s seconds ---" % (time.time() - start_time))
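Since write_csv() opens testimonials.csv in append mode, each run adds new rows next to main.py. A minimal sketch for inspecting the collected rows, assuming the script above has already been run and the column order (name, url, rating) matches what write_csv() emits:

import csv
import os

path_f = os.path.dirname(os.path.abspath(__file__))

with open(os.path.join(path_f, "testimonials.csv"), newline='', encoding='utf-8') as file_csv:
    # each row is [name, url, rating], as written by write_csv()
    for name, url, rating in csv.reader(file_csv):
        print(f"{name} ({rating} ratings) -> {url}")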