|
| 1 | +# Linkedin My_Connections Scrapper |
| 2 | +# Written by XZANATOL |
| 3 | +from selenium.webdriver.common.action_chains import ActionChains |
| 4 | +from optparse import OptionParser |
| 5 | +from selenium import webdriver |
| 6 | +import pandas as pd |
| 7 | +import time |
| 8 | +import sys |
| 9 | +import re |
| 10 | + |
# Regular expressions run against the text of each connection card.
pattern_name = "\\n(.+)\\n"  # Used to extract names
pattern_headline = 'occupation\\n(.+)\\n'  # Used to extract headlines

# Hand-written help menu.
# NOTE(review): `usage` is not passed to OptionParser in this section, so
# `-h` prints optparse's generated help instead -- confirm whether that is
# intended.
usage = """
<Script> [Options]

[Options]
 -h, --help Show this help message and exit.
 -e, --email Enter login email
 -p, --password Enter login password
 -s, --skills Flag to scrap each profile, and look at its skill set

Operation Modes:
> Basic mode
 This will scrap all LinkedIn connections list with there corresponding Name, Headline, and Profile link.
> Skills scrapper mode (-s/--skills)
 (Time Consuming mode)
 This will do the same job of basic mode but along with visiting each profile and extracting the skills of each.
"""

# Command-line options; values are read later from `parser.parse_args()`.
parser = OptionParser()
parser.add_option(
    "-e",
    "--email",
    dest="email",
    help="Enter login email",
)
parser.add_option(
    "-p",
    "--password",
    dest="password",
    help="Enter login password",
)
parser.add_option(
    "-s",
    "--skills",
    action="store_true",
    dest="skills",
    help="Flag to scrap each profile, and look at its skill set",
)
| 37 | + |
| 38 | + |
def login(email, password):
    """Log in to LinkedIn through an automated Chrome session.

    Args:
        email: Value typed into the ``session_key`` field.
        password: Value typed into the ``session_password`` field.

    Returns:
        The Selenium Chrome driver after the sign-in form was submitted and
        the page title equals ``"LinkedIn"``.

    Raises:
        SystemExit: With status 1 when a credential is missing/empty, or when
            the page title after submitting is not ``"LinkedIn"``.
    """
    # Reject missing credentials before a browser is launched; otherwise the
    # failure would only surface inside send_keys() with Chrome left running.
    if not email or not password:
        print("Both an e-mail and a password are required to log in.")
        sys.exit(1)

    # Get LinkedIn login page
    driver = webdriver.Chrome("chromedriver.exe")
    try:
        driver.get("https://www.linkedin.com")
        # Fill the username and password fields, then submit the form
        driver.find_element_by_name("session_key").send_keys(email)
        driver.find_element_by_name("session_password").send_keys(password)
        driver.find_element_by_class_name("sign-in-form__submit-button").click()
        # The page title is the only signal this script uses for success
        logged_in = driver.title == "LinkedIn"
    except Exception:
        # Do not leave an orphaned browser behind when any step fails
        driver.quit()
        raise

    if not logged_in:
        print("Provided E-mail/Password is wrong!")
        driver.quit()
        sys.exit(1)  # non-zero so callers/shells can detect the failure

    # Return session
    return driver
| 60 | + |
| 61 | + |
def scrap_basic(driver):
    """Scrape names, headlines and profile links from the connections page.

    The page is scrolled repeatedly until ``document.body.scrollHeight`` stops
    changing, then the text of every ``mn-connection-card__details`` element is
    parsed with ``pattern_name`` / ``pattern_headline`` and every anchor whose
    ``href`` contains ``https://www.linkedin.com/in`` is collected.

    Args:
        driver: Logged-in Selenium driver.

    Returns:
        tuple: ``(driver, names, headlines, links)``. ``names`` and
        ``headlines`` hold one entry per card (``" "`` when the pattern does
        not match); ``links`` holds unique profile URLs in page order.
        NOTE(review): links are gathered from all anchors on the page, not
        per card, so their alignment with ``names`` is not guaranteed here.
    """
    driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")

    # Bypassing Ajax Call through scrolling the page up and down multiple times.
    # Base case is when the height of the scroll bar is constant after 2 complete scrolls.
    time_to_wait = 3  # Best interval for a 512KB/Sec download speed - change it according to your internet speed
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down to bottom

        # Scrolling up and down twice works around the Ajax call being cancelled
        for _ in range(2):
            time.sleep(time_to_wait)
            driver.execute_script("window.scrollTo(0, 0);")  # Scroll up to top
            time.sleep(time_to_wait)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down to bottom

        new_height = driver.execute_script("return document.body.scrollHeight")  # Update scroll bar height
        if new_height == last_height:
            break
        last_height = new_height

    # Extract card texts (without links)
    card_texts = [
        card.text
        for card in driver.find_elements_by_class_name("mn-connection-card__details")
    ]

    names = []
    headlines = []
    for text in card_texts:
        # group(1) is the captured line only; the full match would also carry
        # the surrounding newlines (and the literal "occupation" prefix).
        name_match = re.search(pattern_name, text)
        names.append(name_match.group(1) if name_match else " ")

        headline_match = re.search(pattern_headline, text)
        headlines.append(headline_match.group(1) if headline_match else " ")

    # Extract links
    links = []
    seen = set()  # O(1) duplicate check while `links` keeps page order
    for anchor in driver.find_elements_by_tag_name('a'):
        href = anchor.get_attribute("href")
        if not href:
            # Anchors without an href yield None, which cannot be searched
            continue
        if "https://www.linkedin.com/in" in href and href not in seen:
            seen.add(href)
            links.append(href)

    # Return outputs
    return driver, names, headlines, links
| 112 | + |
| 113 | + |
def scrap_skills(driver, links):
    """Visit each profile link and collect the skills shown on it.

    For every link the profile is scrolled until
    ``document.body.scrollHeight`` stops changing, the button whose
    ``data-control-name`` is ``skill_details`` is clicked when present, and the
    text of every element whose class starts with
    ``pv-skill-category-entity__name-text`` is joined with ``" -- "``.

    Args:
        driver: Logged-in Selenium driver.
        links: Iterable of profile URLs.

    Returns:
        tuple: ``(driver, skill_set)`` where ``skill_set`` has exactly one
        string per link, in the same order (possibly empty when no skill
        elements were found).
    """
    time_to_wait = 3
    skill_set = []
    for link in links:
        driver.get(link)

        # Bypassing Ajax Call through scrolling through profile multiple sections
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down to bottom

            # Stepping through the page twice works around the Ajax call being cancelled
            for _ in range(2):
                time.sleep(time_to_wait)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight/4);")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight*3/4);")
                time.sleep(time_to_wait)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down to bottom

            new_height = driver.execute_script("return document.body.scrollHeight")  # Update scroll bar height
            if new_height == last_height:
                break
            last_height = new_height

        # Locate the "show more skills" button. It is reset for every profile
        # so a missing button can never reuse the previous page's element.
        button = None
        for candidate in driver.find_elements_by_tag_name('button'):
            if candidate.get_attribute("data-control-name") == "skill_details":
                button = candidate
                break

        # Scroll to the button and click it; profiles without it are still
        # scraped for whatever skill entries are already visible.
        if button is not None:
            ActionChains(driver).move_to_element(button).click().perform()

        # Finally extract the skills
        skills = driver.find_elements_by_xpath("//*[starts-with(@class,'pv-skill-category-entity__name-text')]")
        # One joined string per profile keeps skill_set aligned with links
        skill_set.append(" -- ".join(skill.text for skill in skills))

    # Return session & skills
    return driver, skill_set
| 162 | + |
| 163 | + |
def save_to_csv(names, headlines, links, skills):
    """Write the scraped data to ``scrap.csv`` in the current directory.

    One row is written per entry in ``names``. The other columns are trimmed
    or padded with ``None`` (an empty CSV cell) to that length, so a length
    mismatch no longer aborts the save.

    Args:
        names: List of connection names; defines the number of rows.
        headlines: List of headlines.
        links: List of profile links.
        skills: List of skill strings, or ``None`` when skills were not
            scraped (the ``Skills`` column is then left empty).
    """
    columns = ["Name", "Headline", "Link", "Skills"]
    row_count = len(names)

    # If skills argument was false
    if skills is None:
        skills = [None] * row_count

    def fit(values):
        """Return *values* as a list of exactly ``row_count`` items."""
        fitted = list(values)[:row_count]
        return fitted + [None] * (row_count - len(fitted))

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every row. Passing `columns` keeps the
    # header present even when nothing was scraped.
    df = pd.DataFrame(
        {
            "Name": fit(names),
            "Headline": fit(headlines),
            "Link": fit(links),
            "Skills": fit(skills),
        },
        columns=columns,
    )
    # Save to CSV
    df.to_csv("scrap.csv", index=False, columns=columns)
| 174 | + |
| 175 | + |
| 176 | +# Start checkpoint |
if __name__ == "__main__":
    # Script entry point: parse options, log in, scrape, save, and always
    # close the browser afterwards.
    (options, args) = parser.parse_args()

    # Inputs
    email = options.email
    password = options.password
    skills = options.skills

    # optparse leaves options that were not supplied as None; stop with the
    # parser's own error message before a browser is started.
    if not email or not password:
        parser.error("Both -e/--email and -p/--password are required.")

    driver = login(email, password)  # Login Phase
    try:
        print("Successfull Login!")
        print("Commencing 'My-Connections' list scrap...")
        driver, names, headlines, links = scrap_basic(driver)  # Basic Scrap Phase
        print("Finished basic scrap, scrapped {}".format(len(names)))

        if skills:
            print("Commencing 'Skills' scrap...")
            driver, skill_set = scrap_skills(driver, links)  # Skills Scrap Phase
            print("Finished Skills scrap.")
            print("Saving to CSV file...")
        else:
            skill_set = None  # save_to_csv leaves the Skills column empty

        save_to_csv(names, headlines, links, skill_set)  # Save to CSV
        print("Scrapping session has ended.")
    finally:
        # End Session -- also runs when scraping or saving raises, so the
        # browser process is never left behind.
        driver.quit()
| 203 | + |
| 204 | + |
0 commit comments