# txtsd / fb2vcard
							#!/usr/bin/env python

from bs4 import BeautifulSoup as bs
import csv
# from fuzzywuzzy import fuzz
import glob
from pprint import pprint
import re
import calendar
import phonenumbers

# Directory (with trailing slash) that holds the saved profile pages.
DIR_DATA = 'data/'

# Birthday text: a month word, a day number, and optionally a year that
# follows after commas and/or spaces.  Named groups: month, day, year.
PATTERN_BIRTHDAY = re.compile(
    r'(?P<month>\w+?) (?P<day>\d+)([, ]+(?P<year>\d+)){0,}'
)

# Column headers of the generated CSV, in output order.
FIELDNAMES = [
    # Names
    'Name',
    'Given Name',
    'Additional Name',
    'Family Name',
    'Yomi Name',
    'Given Name Yomi',
    'Additional Name Yomi',
    'Family Name Yomi',
    'Name Prefix',
    'Name Suffix',
    'Initials',
    'Nickname',
    'Short Name',
    'Maiden Name',
    # Personal details
    'Birthday',
    'Gender',
    'Location',
    'Billing Information',
    'Directory Server',
    'Mileage',
    'Occupation',
    'Hobby',
    'Sensitivity',
    'Priority',
    'Subject',
    'Notes',
    'Language',
    'Photo',
    'Group Membership',
    # Contact channels
    'E-mail 1 - Type',
    'E-mail 1 - Value',
    'IM 1 - Type',
    'IM 1 - Service',
    'IM 1 - Value',
    'Website 1 - Type',
    'Website 1 - Value',
    'Phone 1 - Type',
    'Phone 1 - Value',
]


# Collect every file under DIR_DATA whose name matches about_*.html.
list_files = glob.glob(f'{DIR_DATA}about_*.html')

# Accumulator used only by the commented-out contact-info survey at the
# end of the file.
list_of_things = set()

# Truncate/create the output file and emit the header row.  The file is
# closed again right away; the data rows are appended further down.
with open('from_facebook.csv', mode='w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, FIELDNAMES)
    writer.writeheader()

# Page titles that mark a saved file with incorrect content (nothing to scrape).
_ERROR_TITLES = (
    "You Can't Use This Feature Right Now",
    "Content Not Found",
    "Error Facebook",
)

# Month name -> month number, built once instead of once per profile.
# calendar.month_name[0] is the empty string and is left out.
_MONTH_NUMBERS = {
    name: index for index, name in enumerate(calendar.month_name) if name
}

# Region handed to phonenumbers.parse() as the default for mobile numbers.
_PHONE_REGION = 'IN'


def _sibling_text(node, levels=3, strip=True):
    """Return the text of the element that follows an ancestor of *node*.

    Climbs ``levels`` parents up from *node*, takes that ancestor's
    ``next_sibling`` and returns its ``get_text(strip=strip)``.

    Returns None when the tree is shallower than ``levels``, when there is
    no next sibling, or when the resulting text is empty.
    """
    for _ in range(levels):
        if node is None:
            return None
        node = node.parent
    if node is None or node.next_sibling is None:
        return None
    return node.next_sibling.get_text(strip=strip) or None


def _text_after_label(soup, label, levels=3, strip=True):
    """Return the value shown next to the first text node equal to *label*.

    Returns None when the label is absent or has no non-empty value.
    """
    node = soup.find(text=label)
    if node is None:
        return None
    return _sibling_text(node, levels, strip)


def _texts_after_label(soup, label, levels=3, strip=True):
    """Return the non-empty values next to every text node equal to *label*."""
    values = (
        _sibling_text(node, levels, strip)
        for node in soup.find_all(text=label)
    )
    return [value for value in values if value]


def _parse_birthday(text):
    """Convert birthday text into ``(birthday, birthday_google)`` strings.

    With a year the results are ``YYYYMMDD`` and ``MM/DD/YYYY``; without
    one they are ``--MMDD`` and ``MM/DD``.  Returns ``(None, None)`` when
    *text* is empty, does not match PATTERN_BIRTHDAY, or names a month
    that calendar.month_name does not contain.
    """
    if not text:
        return None, None
    found = PATTERN_BIRTHDAY.search(text)
    if not found:
        return None, None
    month_number = _MONTH_NUMBERS.get(found.group('month'))
    if month_number is None:
        # Unknown month word: skip the birthday rather than abort the run.
        return None, None

    num_month = format(month_number, '02d')
    num_day = found.group('day').zfill(2)
    year = found.group('year')  # None when the text carries no year
    if year:
        return (year + num_month + num_day,
                num_month + '/' + num_day + '/' + year)
    return '--' + num_month + num_day, num_month + '/' + num_day


# Append one CSV row per usable profile page.  The output file is opened
# once for the whole run instead of once per profile.
with open('from_facebook.csv', 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)

    for file in list_files:
        # NOTE(review): read with the platform default encoding, as before.
        with open(file) as f:
            soup = bs(f.read(), 'lxml')

        # Look for files with incorrect content.  A page without a <title>
        # or without the name element is treated the same way.
        title = soup.title.text if soup.title is not None else None
        name_tag = soup.select_one('div span div span strong')
        if title in _ERROR_TITLES or name_tag is None:
            print(file, 'is wrong. Skipping.')
            print()
            continue

        # Name: all text inside the name element except text whose direct
        # parent is a <span>.
        name_full = ''.join(
            text for text in name_tag.find_all(text=True)
            if text.parent.name != 'span'
        )

        # Alternate name: the first and last characters are dropped
        # (presumably surrounding parentheses -- TODO confirm).
        alternate_tag = name_tag.select_one('.alternate_name')
        name_alternate = None
        if alternate_tag is not None:
            name_alternate = alternate_tag.get_text()[1:-1]

        # Birthday (value text is taken unstripped)
        birthday, birthday_google = _parse_birthday(
            _text_after_label(soup, 'Birthday', strip=False)
        )

        # Gender
        gender = _text_after_label(soup, 'Gender')

        # Email (a profile may list several)
        list_email = _texts_after_label(soup, 'Email')

        # Relationship: its label sits six levels below the value's sibling
        relationship = _text_after_label(soup, 'Relationship', levels=6)

        # Address (value text is taken unstripped)
        address = _text_after_label(soup, 'Address', strip=False)

        # Links
        link_facebook = _text_after_label(soup, 'Facebook')
        link_instagram = _text_after_label(soup, 'Instagram')
        link_youtube = _text_after_label(soup, 'YouTube')
        link_twitter = _text_after_label(soup, 'Twitter')
        list_link_websites = _texts_after_label(soup, 'Websites')
        link_other = _text_after_label(soup, 'Other Service')
        link_tumblr = _text_after_label(soup, 'Tumblr')

        # Messaging handles
        social_snapchat = _text_after_label(soup, 'Snapchat')
        social_ebuddy = _text_after_label(soup, 'eBuddy')
        social_line = _text_after_label(soup, 'LINE')
        social_skype = _text_after_label(soup, 'Skype')

        # Cities
        city_current = _text_after_label(soup, 'Current City')
        city_home = _text_after_label(soup, 'Hometown')

        # Mobile, normalised to RFC 3966 form
        number_mobile = None
        mobile_raw = _text_after_label(soup, 'Mobile')
        if mobile_raw:
            try:
                number_parsed = phonenumbers.parse(mobile_raw, _PHONE_REGION)
            except phonenumbers.phonenumberutil.NumberParseException:
                # One bad number must not abort the whole export.
                print(file, 'has an unparseable mobile number. Ignoring it.')
            else:
                number_mobile = phonenumbers.format_number(
                    number_parsed, phonenumbers.PhoneNumberFormat.RFC3966
                )

        # Save to CSV.  Only the name, birthday, gender, first e-mail and
        # mobile number are exported; the other scraped values above are
        # collected but not written yet.
        csv_dict = {'Name': name_full}
        if birthday:
            csv_dict['Birthday'] = birthday
        if gender:
            csv_dict['Gender'] = gender
        if list_email:
            # The CSV has a single e-mail column, so use the first address.
            # (This line used to read an undefined name, `email`.)
            csv_dict['E-mail 1 - Value'] = list_email[0]
        if number_mobile:
            csv_dict['Phone 1 - Value'] = number_mobile
        writer.writerow(csv_dict)


#         # Output Contact Info Types
#         match = soup.select('#contact-info > div > div:nth-of-type(2) table tr td div span')
#         print(name_full, match)
#         for thing in match:
#             list_of_things.add(thing.get_text())
# pprint(list_of_things)