1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- import requests
- from bs4 import BeautifulSoup
- import os
- import csv
- import time
def write_csv(data):
    """Append one record to ``testimonials.csv`` next to this script.

    ``data`` must provide the ``'name'``, ``'url'`` and ``'rating'`` keys;
    their values are written as a single CSV row, in that order.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    target = os.path.join(script_dir, "testimonials.csv")
    # Mode 'a' appends, so rows written by earlier calls are preserved.
    with open(target, 'a', newline='', encoding='utf-8') as out:
        row = [data[key] for key in ('name', 'url', 'rating')]
        csv.writer(out).writerow(row)
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout, and (as ``HTTPError``) on a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text
def get_normalise_str(string):
    """Return the first whitespace-separated token of *string* without commas.

    Intended for rating captions such as ``"1,234 total ratings"``, which
    become ``"1234"``.

    Args:
        string: Raw text to normalise.

    Returns:
        The leading token with every ``','`` removed, or ``''`` when the
        input is empty or contains only whitespace.
    """
    # split() with no argument ignores leading whitespace and also splits on
    # tabs, newlines and non-breaking spaces, which scraped text often has.
    tokens = string.split()
    if not tokens:
        return ''
    return tokens[0].replace(',', '')
def get_data(html):
    """Extract records from *html* and append each one to the CSV file.

    For every ``<article>`` inside every ``<section>`` this reads the
    ``<h3>`` text (name), the ``href`` of the first ``<a>`` (url) and the
    text of the link inside ``span.rating-count`` (rating). The rating is
    normalised with ``get_normalise_str`` and the record is passed to
    ``write_csv``.

    Articles missing any of these elements are skipped: ``find()`` returns
    ``None`` for a missing tag, and dereferencing it would abort the whole
    run with ``AttributeError``.

    Args:
        html: HTML document as a string.
    """
    soup = BeautifulSoup(html, 'lxml')  # markup and the parser name
    for section in soup.find_all('section'):
        for article in section.find_all('article'):
            heading = article.find('h3')
            anchor = article.find('a')
            rating_box = article.find('span', class_='rating-count')
            rating_link = rating_box.find('a') if rating_box is not None else None
            if heading is None or anchor is None or rating_link is None:
                # Incomplete card: nothing reliable to record.
                continue
            write_csv({
                'name': heading.text,
                'url': anchor.get('href'),
                'rating': get_normalise_str(rating_link.text),
            })
def main():
    """Fetch the WordPress plugin directory page and pass it to ``get_data``."""
    get_data(get_html('https://wordpress.org/plugins/'))
if __name__ == '__main__':
    # Run the scraper and report how long the whole run took.
    start_time = time.time()
    main()
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)
|