-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape.js
111 lines (89 loc) · 2.9 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
const puppeteer = require('puppeteer')
const fs = require('fs')
const { count } = require('console')
/**
 * Collects the song links listed on the letras.mus.br "mais acessadas / indie" page.
 *
 * Launches a browser, opens the ranking page and reads the `href` attribute of
 * every element matching `.g-2-3 > ol > li > a`.
 *
 * @returns {Promise<Array<string|null>>} The raw `href` values in document order
 *   (`null` for a matched anchor without an `href`). Callers in this file prepend
 *   the site origin, so the values are presumably site-relative paths.
 * @throws Rejects with whatever Puppeteer raises while launching, navigating or
 *   evaluating; the browser is closed before the rejection propagates.
 */
let getUrlsFromLetras = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.letras.mus.br/mais-acessadas/indie/');
    // The callback runs inside the page, so it may only use browser-side globals.
    return await page.evaluate(() =>
      Array.from(
        document.querySelectorAll('.g-2-3 > ol > li > a'),
        (anchor) => anchor.getAttribute('href'),
      ),
    );
  } finally {
    // Always release the browser process, even when navigation or evaluation fails.
    await browser.close();
  }
};
/**
 * Downloads the lyrics of every song returned by `getUrlsFromLetras` and stores
 * each one in `lyrics2/lyrics<N>`, where N is the 1-based position of the song
 * in the URL list.
 *
 * Songs are visited sequentially on a single page. A failed file write is logged
 * and does not stop the run; a failed navigation or evaluation rejects the
 * returned promise.
 *
 * @returns {Promise<void>} Resolves once every page has been visited and every
 *   write has either completed or been logged as failed.
 */
let getLyricsFromLetras = async () => {
  const urls = await getUrlsFromLetras();
  // Make sure the output directory exists so the writes below cannot fail with ENOENT.
  await fs.promises.mkdir('lyrics2', { recursive: true });

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    let count = 0;
    // A plain `for...of` with a block-scoped binding: `urls` is an ordinary
    // array, and an undeclared loop variable would leak an implicit global.
    for (const url of urls) {
      count++;
      const songUrl = `https://www.letras.mus.br${url}`;
      console.log(songUrl);
      // timeout: 0 disables Puppeteer's navigation timeout for slow pages.
      await page.goto(songUrl, {
        waitUntil: 'load',
        timeout: 0,
      });
      const lyrics = await page.evaluate(() => {
        let text = '';
        document
          .querySelectorAll('article > div > .cnt-letra > p')
          .forEach((paragraph) => {
            text = text + '\n' + paragraph.innerText;
          });
        return text;
      });
      // Await the write so the function only resolves after the data is on disk
      // and the log line reports the counter that belongs to this file.
      try {
        await fs.promises.writeFile(`lyrics2/lyrics${count}`, lyrics, 'UTF-8');
        console.log(`file saved as lyrics${count}`);
      } catch (err) {
        // Best effort: report the failed file and continue with the next song.
        console.log(err);
      }
    }
  } finally {
    // Release the browser process even when a navigation or evaluation throws.
    await browser.close();
  }
};
/**
 * Collects the song links of the vagalume.com.br playlist
 * "os-melhores-indies-lancados-nessa-decada".
 *
 * Launches a browser, opens the playlist page and reads the `href` attribute of
 * every element matching `.lyric`.
 *
 * @returns {Promise<Array<string|null>>} The raw `href` values in document order
 *   (`null` for a matched element without an `href`). Callers in this file
 *   prepend the site origin, so the values are presumably site-relative paths.
 * @throws Rejects with whatever Puppeteer raises while launching, navigating or
 *   evaluating; the browser is closed before the rejection propagates.
 */
let getUrlsFromPlaylist = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(
      'https://www.vagalume.com.br/playlisteiros/os-melhores-indies-lancados-nessa-decada/',
    );
    // The callback runs inside the page, so it may only use browser-side globals.
    return await page.evaluate(() =>
      Array.from(document.querySelectorAll('.lyric'), (element) =>
        element.getAttribute('href'),
      ),
    );
  } finally {
    // Always release the browser process, even when navigation or evaluation fails.
    await browser.close();
  }
};
/**
 * Downloads the lyrics of every song returned by `getUrlsFromPlaylist` and
 * stores each one in `playlist1/lyrics<N>`, where N is the 1-based position of
 * the song in the URL list.
 *
 * Each song is loaded sequentially in its own page, which is closed as soon as
 * its text has been extracted. A failed file write is logged and does not stop
 * the run; a failed navigation or evaluation rejects the returned promise.
 *
 * @returns {Promise<void>} Resolves once every page has been visited and every
 *   write has either completed or been logged as failed.
 */
let getLyricsFromVagalume = async () => {
  const urls = await getUrlsFromPlaylist();
  // Make sure the output directory exists so the writes below cannot fail with ENOENT.
  await fs.promises.mkdir('playlist1', { recursive: true });

  let count = 0;
  const browser = await puppeteer.launch();
  try {
    // A plain `for...of` with a block-scoped binding: `urls` is an ordinary
    // array, and an undeclared loop variable would leak an implicit global.
    for (const url of urls) {
      count++;
      const songUrl = `https://www.vagalume.com.br${url}`;
      console.log(songUrl);

      const page = await browser.newPage();
      let lyrics;
      try {
        // timeout: 0 disables Puppeteer's navigation timeout for slow pages.
        await page.goto(songUrl, {
          waitUntil: 'load',
          timeout: 0,
        });
        lyrics = await page.evaluate(() => {
          let text = '';
          document.querySelectorAll('#lyrics').forEach((paragraph) => {
            text = text + '\n' + paragraph.innerText;
          });
          return text;
        });
      } finally {
        // One page is opened per song; close it so tabs do not pile up in the browser.
        await page.close();
      }

      // Await the write so the function only resolves after the data is on disk
      // and the log line reports the counter that belongs to this file.
      try {
        await fs.promises.writeFile(`playlist1/lyrics${count}`, lyrics, 'UTF-8');
        console.log(`file saved as lyrics${count}`);
      } catch (err) {
        // Best effort: report the failed file and continue with the next song.
        console.log(err);
      }
    }
  } finally {
    // Release the browser process even when a navigation or evaluation throws.
    await browser.close();
  }
};
// Entry point: scrape the letras.mus.br ranking and report completion or failure.
getLyricsFromLetras()
  .then(() => {
    console.log("Finished");
  })
  .catch((err) => {
    // Without a rejection handler a scraping failure would only surface as an
    // unhandled promise rejection; log it and signal failure to the shell.
    console.error(err);
    process.exitCode = 1;
  });
// getLyricsFromVagalume().then(() => {
// console.log("Finished")
// })