123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 |
- #!/usr/bin/env python
- import re
- import html
- import time
- import calendar
- import json
- from pprint import pprint
- from pathlib import Path
- import requests
- from bs4 import BeautifulSoup as bs
# Base URL of the mobile site; relative hrefs scraped from pages are appended to it.
LINK_MAIN = 'https://m.facebook.com'
# URL the login form fields are POSTed to.
LINK_LOGIN = 'https://m.facebook.com/login/device-based/regular/login/?refsrc=https://m.facebook.com/&lwv=100&refid=8'
# First page of the friends listing.
LINK_FRIENDS = 'https://m.facebook.com/friends/center/friends/'
# Fetched once after login; the query string asks for l=en_US, which the
# English-text checks in this script rely on.
# NOTE(review): the gfid value looks session-specific -- confirm it is still accepted.
LINK_LANGUAGE = 'https://m.facebook.com/a/language.php?l=en_US&lref=%2Fsettings%2Flanguage%2F&sref=legacy_mobile_settings&gfid=AQADZjSUoWMlr7lH'

# Regex extractors for the scraped HTML. Only PATTERN_DEACTIVATED is called by
# the visible part of this file; the others are not referenced by live code here.
PATTERN_HOVERCARD_NAME = re.compile(r'<a class="b[qo]" href="(?P<link>.+?)">(?P<name>.+?)</a>')
PATTERN_SEEMORE = re.compile(r'[0ki]"><a href="(.+?)"><span>See More</span></a>')
PATTERN_PROFILE = re.compile(r'<div class="(?:bc|x)"><a href="(.+?)"')
# The sentence is matched literally, so its full stop is escaped.
PATTERN_DEACTIVATED = re.compile(r'This account has been deactivated\.')
PATTERN_ABOUT = re.compile(r'[dgl]"><a href="(.+?)" class="\w\w">About</a>')
# Literal rate-limit notice; both full stops are escaped.
PATTERN_RATELIMIT = re.compile(
    r'We limit how often you can post, comment or do other things in a given '
    r'amount of time in order to help protect the community from spam\. '
    r'You can try again later\.'
)
# Captures month and day; the year group is None when the page shows no year.
PATTERN_BIRTHDAY = re.compile(r'Birthday</span></div></td><td valign="top" class="\w\w"><div class="\w\w">(?P<month>\w+?) (?P<day>\d+)([, ]+(?P<year>\d+)){0,}</div>')
# Friend records keyed by display name; filled from the saved JSON and the
# friends listing further down.
list_friends = {}

# Form fields that are sent with the login POST.
data_dict = {}

# Browser-like request headers.
# NOTE(review): nothing in the visible code attaches these to the session --
# confirm whether that is intended.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
}
# --- Log in ------------------------------------------------------------------
# One Session is used for every request so cookies set at login are reused.
session = requests.Session()

print('Loading m.facebook.com')
result = session.get(LINK_MAIN)
soup = bs(result.content, 'lxml')

# The first <form> on the landing page is assumed to be the login form; its
# input fields are copied so they are sent back together with the credentials.
form = soup.select_one('form')
if form is None:
    # Without the form there is nothing to submit; stop with a clear message
    # instead of failing later on a None attribute access.
    raise SystemExit('ERROR: No login form found on ' + LINK_MAIN)

for nput in form.select('input'):
    field_name = nput.get('name')
    # Inputs without a name carry no form field, and 'sign_up' is excluded
    # from the submitted fields.
    if field_name and field_name != 'sign_up':
        data_dict[field_name] = nput.get('value')

data_dict['login'] = 'Log In'
# Credentials are blank in the source; set them before running.
data_dict['email'] = ''
data_dict['pass'] = r""

print('Logging in')
time.sleep(5)
result = session.post(LINK_LOGIN, data=data_dict)
# NOTE(review): the response is not inspected, so this prints even when the
# login was rejected.
print('Logged in!')

time.sleep(5)
result = session.get(LINK_LANGUAGE)
print('Language reset')
# Load the friend records saved by a previous run, so links that were already
# resolved are not fetched again.
print('Reading friends JSON')
try:
    with open('data/' + 'list_friends.json', 'r') as f:
        list_friends = json.load(f)
except FileNotFoundError:
    # First run: nothing has been saved yet, so start from an empty mapping.
    list_friends = {}
# --- Collect friends from the paginated listing --------------------------------
print('Populating friends')
result = session.get(LINK_FRIENDS)

while True:
    soup = bs(result.content, 'lxml')

    # Each friend on the current page is an anchor sitting directly in a table cell.
    for anchor in soup.select('#friends_center_main table td > a'):
        friend_name = anchor.get_text().title()
        friend_href = anchor.get('href')
        if friend_name in list_friends:
            # Already known: keep the stored record untouched.
            continue
        print('Added:', friend_name)
        list_friends[friend_name] = {'link_hovercard': LINK_MAIN + friend_href}

    # The first anchor that is a direct child of a <div> is followed as the
    # link to the next page; when there is none, the listing is exhausted.
    next_anchor = soup.select_one('#friends_center_main div > a')
    if next_anchor is None:
        break

    next_href = next_anchor.get('href')
    time.sleep(10)
    result = session.get(LINK_MAIN + next_href)

print('Done populating friends!')
# Save the friend list to disk before the per-friend passes start.
print('Writing friends JSON')
friends_json_path = 'data/' + 'list_friends.json'
with open(friends_json_path, 'w') as json_file:
    json.dump(list_friends, json_file, indent=2, ensure_ascii=False)
print('Done writing friends JSON')
# --- Pass 1: visit each hovercard and record the profile link -------------------
# NOTE(review): the original indentation was lost; the JSON save is placed
# inside the per-friend work so progress is kept after every fetch -- confirm.
for person in list_friends:
    # Set to True to fetch the hovercard again for friends that already have
    # a stored 'link_profile' (for example to re-check 'DEACTIVATED' entries).
    reprocess = False
    if 'link_profile' in list_friends[person] and not reprocess:
        continue

    link = list_friends[person]['link_hovercard']
    time.sleep(10)
    result = session.get(link)

    # Keep the raw page on disk.
    with open('data/' + 'hovercard_' + person + '.html', 'wb') as f:
        f.write(result.content)

    soup = bs(result.content, 'lxml')
    soup_match = soup.select_one('#objects_container table td div div:nth-of-type(3) > a')
    print('Visit hovercard > Get profile link:', person)

    # The anchor only counts when it wraps a <span> reading 'View Profile'.
    # An anchor without a <span> is treated as "no match" rather than an error.
    label = soup_match.span if soup_match is not None else None
    if label is not None and label.get_text() == 'View Profile':
        link_profile_safe = soup_match.get('href')
        list_friends[person]['link_profile'] = LINK_MAIN + link_profile_safe
    elif PATTERN_DEACTIVATED.search(result.text):
        list_friends[person]['link_profile'] = "DEACTIVATED"
    # When neither case applies nothing is stored, so the friend is tried
    # again on the next run.

    with open('data/' + 'list_friends.json', 'w') as f:
        json.dump(list_friends, f, indent=2, ensure_ascii=False)
# --- Pass 2: visit each profile and record the About link -----------------------
# NOTE(review): the original indentation was lost; the JSON save is placed
# after each processed friend so progress is kept -- confirm.
for person in list_friends:
    entry = list_friends[person]
    link = entry.get('link_profile')

    # Skip friends with no profile link stored, deactivated accounts, and
    # friends whose About link is already known.
    if link is None or link == 'DEACTIVATED' or 'link_about' in entry:
        continue

    time.sleep(30)
    result = session.get(link)

    # Keep the raw page on disk.
    with open('data/' + 'timeline_' + person + '.html', 'wb') as f:
        f.write(result.content)

    soup = bs(result.content, 'lxml')
    soup_match = soup.select_one('#m-timeline-cover-section > div:nth-of-type(4) > a')
    print('Visit profile > Get about link:', person)

    if soup_match is not None and soup_match.get_text() == 'About':
        link_about_safe = soup_match.get('href')
        entry['link_about'] = LINK_MAIN + link_about_safe
    else:
        print('ERROR: No about link!')

    with open('data/' + 'list_friends.json', 'w') as f:
        json.dump(list_friends, f, indent=2, ensure_ascii=False)
# --- Pass 3: download each About page for later parsing -------------------------
# Page titles that mark a saved file as an error page instead of an About page.
ABOUT_ERROR_TITLES = (
    "You Can't Use This Feature Right Now",
    "Content Not Found",
    "Error Facebook",
    "Profile Pictures",
)

for person in list_friends:
    if 'link_about' not in list_friends[person]:
        continue

    filename = 'data/' + 'about_' + person + '.html'
    file = Path(filename)

    redownload = False
    if file.is_file():
        # The page is saved as raw bytes below, so it is read back as bytes
        # and BeautifulSoup works out the encoding, instead of decoding with
        # the platform default.
        with open(filename, 'rb') as f:
            file_html = f.read()
        soup = bs(file_html, 'lxml')
        # A file without a <title> is left as it is rather than raising.
        title = soup.title.get_text() if soup.title is not None else ''
        redownload = title in ABOUT_ERROR_TITLES

    if redownload or not file.is_file():
        print('Visit About > Save HTML:', person)
        link = list_friends[person]['link_about']
        time.sleep(30)
        result = session.get(link)
        with open(filename, 'wb') as f:
            f.write(result.content)
- # for person in list_friends:
- # # Print Name
- # print('')
- # print(person)
- # # Name
- # fullname = person
- # names = fullname.split(' ')
- # if len(names) == 3:
- # name_first = names[0]
- # name_mid = names[1]
- # name_last = names[2]
- # elif len(names) == 2:
- # name_first = names[0]
- # name_mid = ''
- # name_last = names[1]
- # elif len(names) == 4:
- # name_first = names[0]
- # name_mid = names[1] + ',' + names[2]
- # name_last = names[3]
- # else:
- # name_first = ''
- # name_mid = ''
- # name_last = ''
- # name = '{family};{given};{additional};{prefix};{suffix}'.format(family=name_last, given=name_first, additional=name_mid, prefix='', suffix='')
- # print('FN:' + fullname)
- # print('N:' + name)
- # pprint(list_friends)
|