-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_hackerrank.py
162 lines (130 loc) · 8.32 KB
/
scrape_hackerrank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Captures whatever follows "/rest/contests" up to (not including) the next
# double quote, i.e. the tail of a solution link inside an href attribute.
top_link_regex = re.compile(r'/rest/contests(.*?)(?=")')
def press_esc(driver):
    """Send one Escape key press (key-down then key-up) to dismiss a login popup."""
    # Build the whole key sequence fluently and run it in a single perform()
    ActionChains(driver).key_down(Keys.ESCAPE).key_up(Keys.ESCAPE).perform()
def log_in(driver):
    """Log in through the login popup shown on the current page.

    Waits up to 5 seconds for the element with id ``Log in`` to be present,
    clicks it, fills the email and password inputs and submits the form.
    If the tab never appears, the exception raised by ``WebDriverWait.until``
    propagates to the caller.
    """
    # NOTE(review): the account credentials below are hard-coded in source
    # control; consider loading them from the environment or a config file.
    login_tab = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, 'Log in'))
    )
    # Click the login tab
    login_tab.click()
    # find_element(By.*, ...) is used instead of the find_element_by_* helpers,
    # which were removed from Selenium in 4.3.
    # Enter email
    driver.find_element(By.ID, 'input-4').send_keys("HackerrankDataCollection@gmail.com")
    # Enter password
    driver.find_element(By.ID, 'input-5').send_keys("Tech4Good")
    # Click the login button
    driver.find_element(
        By.CSS_SELECTOR,
        '#hr_v2 > div.portal-wrapper > div > div > div > section > div > div > div.tab-list-content > div.login-form.auth-form.theme-m > form > div.form-item.clearfix > button',
    ).click()
def remove_html_escapes(s):
    """Return *s* with the five basic HTML character entities decoded.

    Handles ``&#39;``, ``&quot;``, ``&gt;``, ``&lt;`` and ``&amp;``; any other
    entity is left untouched.
    """
    html_escapes = (
        ("'", '&#39;'),
        ('"', '&quot;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        # '&amp;' must be decoded last so that '&amp;lt;' becomes '&lt;'
        # (one level of unescaping) rather than '<'.
        ('&', '&amp;'),
    )
    for char, entity in html_escapes:
        s = s.replace(entity, char)
    return s
def decode_statement_html(s):
    """Strip the HTML markup from a problem statement's innerHTML.

    ``<span>...</span>`` runs become the placeholder ``[expression]``,
    ``<style>...</style>`` blocks are deleted, every remaining tag is removed,
    and finally the basic HTML entities are decoded.
    """
    # Replaces <span>...</span> elements with "[expression]". These are formatted math equations.
    # The match is greedy, so several spans on one line collapse into a single placeholder.
    s = re.sub(r'(<span)(.+)(?=</span>)(</span>)', '[expression]', s)
    # Deletes <style>...</style> elements ([\s\S] lets the match cross newlines)
    s = re.sub(r'(<style)([\s\S]*)(?=</style>)(</style>)', '', s)
    # Deletes anything else between <angle brackets> to remove the remaining html code
    s = re.sub(r'((<)(.+?)(?=>))|>', '', s)
    # Entities are decoded only after the markup is gone: decoding first would
    # turn '&lt;' / '&gt;' from the statement text into '<' / '>' that the tag
    # regex above then deletes along with the text between them.
    return remove_html_escapes(s)
def decode_solution_html(s):
    """Strip the ``<pre ...>`` / ``</pre>`` wrapper from a solution page body and decode entities.

    Returns the solution text with the basic HTML entities decoded.
    """
    # Deletes the opening <pre ...> tag (with its attributes) and the closing </pre>
    s = re.sub(r'((<pre)(.+?)(?=>)(>))|(</pre>)', '', s)
    # Decode entities last: decoding first would expose source code such as
    # 'map<prefix, int>' to the '<pre...>' pattern above and delete it.
    return remove_html_escapes(s)
def get_problem_statement(driver, already_logged_in):
    """Return the cleaned-up problem statement of the page loaded in *driver*.

    When *already_logged_in* is false the login popup is filled in via
    ``log_in``; otherwise an Escape key press is sent to dismiss a popup that
    sometimes appears. The statement container's innerHTML is then read and
    passed through ``decode_statement_html``.
    """
    # This deals with a login screen that sometimes pops up
    if not already_logged_in:
        log_in(driver)
    else:
        press_esc(driver)
    # find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
    # which was removed from Selenium in 4.3.
    statement_element = driver.find_element(
        By.CSS_SELECTOR,
        '#content > div > div > div > div.community-content > div > section > div > div > div > div.full-screen-split.split-wrap.left-pane > section.problem-statement.split > div > div > div > div > div > div.challenge_problem_statement > div > div',
    )
    statement_string = statement_element.get_attribute('innerHTML')
    return decode_statement_html(statement_string)
def get_cpp_link(driver):
    """Return the link tail of the first usable C++ row in the solutions table.

    Scans every element with class ``table-row-wrapper`` in document order and
    returns the first ``top_link_regex`` capture from a row whose HTML contains
    ``ellipsis">C++`` and does not contain ``[deleted]``. Returns ``""`` when
    no row qualifies.
    """
    cpp_regex = re.compile(r'(ellipsis">)C\+\+')
    deleted_regex = re.compile(r'\[deleted\]')
    # find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
    # which was removed from Selenium in 4.3.
    for row in driver.find_elements(By.CLASS_NAME, 'table-row-wrapper'):
        row_html = row.get_attribute('innerHTML')
        # Skip rows that are not C++ or that are marked as deleted
        if not cpp_regex.search(row_html) or deleted_regex.search(row_html):
            continue
        links = top_link_regex.findall(row_html)
        if links:
            return links[0]
    return ""
def get_problem_solution(driver, count):
    """Return the top C++ solution for the problem page loaded in *driver*.

    Opens the Leaderboard tab, clicks through the "unlock solutions" dialog if
    it is offered, navigates to the first C++ solution link found by
    ``get_cpp_link`` and returns that page's body after
    ``decode_solution_html``.

    *count* is a caller-supplied counter; when it is 0 an extra 1.5 second
    pause is inserted before the Leaderboard click (presumably to let the page
    settle after the first login -- TODO confirm).

    Returns the string ``'SOLUTION NOT FOUND'`` when no solution link exists or
    navigating to it fails.
    """
    unlock_selector = '#content > div > div > div > div.community-content > div > div.challenge-leaderboard > div > div > div.ui-tabs-wrap > div > section > div:nth-child(1) > div > button > div'
    confirm_selector = '#hr_v2 > div.portal-wrapper > div > div > div > section > div > div.ui-dialog-body > div > div > button.btn.hr_primary-btn.hr-dialog-button'

    # Waits until it can click the Leaderboard tab and then clicks it
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#Leaderboard')))
    if count == 0:
        time.sleep(1.5)
    leader_button = driver.find_element(By.CSS_SELECTOR, '#Leaderboard')
    # A fresh ActionChains per click, so an earlier queued click can never be
    # replayed by a later perform().
    ActionChains(driver).click(leader_button).perform()

    # Try to click the "unlock solutions" button and confirm the dialog.
    # If the button does not exist (it has already been clicked before), the
    # wait times out and we deliberately carry on.
    try:
        WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, unlock_selector)))
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, unlock_selector)))
        unlock_button = driver.find_element(By.CSS_SELECTOR, unlock_selector)
        ActionChains(driver).click(unlock_button).perform()
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, confirm_selector)))
        driver.find_element(By.CSS_SELECTOR, confirm_selector).click()
    except Exception:
        # Best effort only; "except Exception" (not a bare except) so that
        # Ctrl+C and SystemExit still stop the scraper.
        pass

    # Look up the top C++ solution's link and visit it.
    # Sometimes there are no solution links and it returns 'SOLUTION NOT FOUND'
    try:
        top_solution_link = get_cpp_link(driver)
        if not top_solution_link:
            # get_cpp_link signals "nothing found" with an empty string; do not
            # navigate to the bare REST prefix and scrape an unrelated page.
            return 'SOLUTION NOT FOUND'
        # NOTE(review): the captured link tail appears to start with '/', which
        # would give 'contests//...' here -- confirm the server accepts it.
        solution_url = "https://www.hackerrank.com/rest/contests/" + top_solution_link
        driver.get(solution_url)
        print(solution_url)
    except Exception:
        return 'SOLUTION NOT FOUND'

    # Waits until it finds the html solution element and then gets its html container
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body')))
    top_solution_string = driver.find_element(By.CSS_SELECTOR, 'body').get_attribute('innerHTML')
    return decode_solution_html(top_solution_string)
def get_problem(driver, problem_url, count):
    """Load *problem_url* and return ``[statement, top_solution]`` for it.

    *count* is forwarded to ``get_problem_solution``; any value above 0 tells
    ``get_problem_statement`` that the session is already logged in.
    """
    # Navigate to the problem page first; both scrapers read from the live page
    driver.get(problem_url)
    logged_in = count > 0
    statement = get_problem_statement(driver, logged_in)
    solution = get_problem_solution(driver, count)
    return [statement, solution]