# linked_in_scraper.py

import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers  # local module, not on PyPI; see the sketch below
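
# `headers` is a sibling headers.py that this script expects but that is not
# shown here. Judging from how it is used below (headers.headers is passed to
# requests.get), a minimal sketch would be a single dict of request headers
# copied from a logged-in Sales Navigator session; every value here is a
# placeholder, not a working credential:
#
# headers = {
#     'user-agent': 'Mozilla/5.0 ...',
#     'csrf-token': 'ajax:...',
#     'cookie': 'li_at=...; JSESSIONID="ajax:..."',
# }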

# LinkedIn Sales Navigator facet ids used to segment the search.
# FA = job function, SE = seniority level, G = geography.
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26]  # FA
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # SE
LOCATION_FACETS = [  # G
    'us:8-2-0-1-2',
    'us:97',
    'us:va',
    'us:dc',
    'us:tx',
    'us:ca',
    'us:md',
    'us:70',
    'us:31',
    'us:ny',
    'us:8-8-0-8-1',
    'us:8-8-0-3-1',
    'us:ga',
    'us:52',
    'us:7',
    'us:8-8-0-95-11',
    'us:nj',
    'us:3-2-0-31-1',
]

FACETS = [
    ('FA', FUNCTION_FACETS),
    ('SE', SENIORITY_FACETS),
    ('G', LOCATION_FACETS),
]

def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename

def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Gets a single page of search results for a company, optionally
    narrowed by one extra facet (job function, seniority, or location)'''
    params = {
        'facet': ['CC'],  # CC restricts results to the given company
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    if facet is not None and facet_id is not None:
        params['facet'] = ['CC', facet]
        params['facet.' + facet] = facet_id
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()
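
# Hypothetical usage, with ids taken from the facet lists above (the labels
# behind those ids are not documented in this file):
#   page = get_page('533534', facet='FA', facet_id=17)
#   total = page['pagination']['total']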

def get_company(company_id, outname):
    '''Gets all employees from a company, one facet value at a time'''
    people = []
    for facet, facet_ids in FACETS:
        for facet_id in facet_ids:
            print('getting facet', facet, facet_id, 'for company', company_id)
            count = 50
            start = 0
            results = get_page(company_id, facet, facet_id)
            total = results['pagination']['total']
            people += results['searchResults']
            start += count
            # page through the remaining results, `count` at a time
            while start < total:
                print('getting', start, 'of', total)
                time.sleep(1)
                results = get_page(company_id, facet, facet_id, start)
                people += results['searchResults']
                start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname
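
# Assumed shape of the search response, inferred from the fields read above
# (other keys omitted):
# {
#   "pagination": {"total": <int>},
#   "searchResults": [{"member": {...}, "company": {...}}, ...]
# }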

def get_images(datafile):
    '''Downloads profile images'''
    os.makedirs('images', exist_ok=True)
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping', outname)
            continue
        # fetch the widest available rendition of the profile image
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)

def get_profile(pid):
    '''Downloads individual profiles'''
    os.makedirs('profiles', exist_ok=True)
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    # profile data is embedded in the page as JSON inside <code> tags;
    # keep the first blob that includes contact info
    soup = BeautifulSoup(response.text, 'html.parser')
    for c in soup.select('code'):
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except ValueError:
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname

def get_profiles(datafile):
    '''Gets all profiles'''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)

def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''
    out = []
    seen = set()  # memberIds already emitted, to dedupe across facets
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        if mid not in seen:
            out.append(item)
            seen.add(mid)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(out[0].keys()))
        writer.writeheader()
        writer.writerows(out)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)

if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id for U.S. Immigration and Customs Enforcement
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')
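
# clean_and_parse() reads a Jinja2 template named template.html from the
# working directory; it is not included here. A minimal sketch, assuming only
# the `people` variable the script actually passes in:
#
# <ul>
# {% for p in people %}
#   <li>
#     {% if p.img %}<img src="{{ p.img }}">{% endif %}
#     <a href="{{ p.linkedin }}">{{ p.name }}</a>, {{ p.title }},
#     {{ p.company }} ({{ p.location }})
#   </li>
# {% endfor %}
# </ul>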