-
Notifications
You must be signed in to change notification settings - Fork 13
/
youthopscrape.py
95 lines (74 loc) · 2.74 KB
/
youthopscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import csv
import re
import requests
from bs4 import BeautifulSoup
# Scrape the first four listing pages of youthop.com exchange programmes and
# write one CSV row per opportunity (image link, headline, summary, website
# link, deadline) to 'exchange programmes.csv'.

# Seconds to wait on each HTTP request; without a timeout requests.get() can
# block forever on a stalled connection.
_REQUEST_TIMEOUT = 30

# Matches each run of non-ASCII characters; compiled once and reused for
# every text field.
_NON_ASCII = re.compile(r'[^\x00-\x7F]+')


def _ascii_only(text):
    """Return *text* with every run of non-ASCII characters replaced by a space."""
    return _NON_ASCII.sub(' ', text)


def _report_missing(label, error):
    """Print the two-line "OOPS <label>" diagnostic for a field that could not be read."""
    print("OOPS " + label)
    print(str(error))


# newline='' is required by the csv module (otherwise rows gain an extra '\r'
# on Windows); the explicit encoding keeps non-ASCII URLs writable regardless
# of locale. The with-block closes the file even if a request or parse raises.
with open('exchange programmes.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Image Link', 'headline', 'summary', 'Website link', 'Deadline'])

    for i in range(1, 5):
        listing = requests.get(
            'https://www.youthop.com/exchange-programs/page/' + str(i),
            timeout=_REQUEST_TIMEOUT,
        )
        listing_soup = BeautifulSoup(listing.text, 'html.parser')

        for para in listing_soup.find_all('div', class_='post-header'):
            # A post header without a usable link cannot be followed; skip it
            # instead of crashing the whole run.
            anchor = para.a
            detail_url = anchor.get('href') if anchor is not None else None
            if not detail_url:
                continue

            detail = requests.get(detail_url, timeout=_REQUEST_TIMEOUT)
            soup = BeautifulSoup(detail.text, 'html.parser')

            # Reset every field for this opportunity. Otherwise a failed
            # lookup would either raise NameError at writerow (first row) or
            # silently reuse the previous opportunity's value.
            image = headline = summary = web_link = date = ''

            # image of the opportunity
            try:
                image = soup.find('div', class_='article-media').img.get('src')
                print(image)
            except (AttributeError, UnicodeEncodeError) as e:
                _report_missing("image", e)

            # headline of the opportunity
            try:
                headline = _ascii_only(soup.find(id="main").h1.text)
                print(headline)
            except AttributeError as e:
                _report_missing("headline", e)

            # short summary about the opportunity (first paragraph of the article)
            try:
                summary = _ascii_only(soup.find('div', class_='article-content').p.text)
                print(summary)
            except AttributeError as e:
                _report_missing("content", e)

            # deadline date (first item of the post-details list)
            try:
                date = _ascii_only(soup.find('ul', class_='post-details').li.text)
                print(date)
            except AttributeError as e:
                _report_missing("deadline", e)

            # Link for the official website: the second anchor in the
            # application-process section (the first one is the apply-now link).
            try:
                links = soup.find('div', class_='application-process').find_all('a')
                web_link = links[1].get('href')
                print(web_link)
            except (IndexError, AttributeError) as e:
                _report_missing("link", e)

            csv_writer.writerow([image, headline, summary, web_link, date])