# Scraping_tools.py
import requests
from bs4 import BeautifulSoup as soup
import re
import pandas as pd
from langdetect import detect


def create_table_artist_link_lyrics(artist):
    """Receive an artist name and return a DataFrame containing the name
    of the artist, the title of each song and the link to its lyrics."""
    # Path to the first page of the artist's songs
    path = f'https://www.metrolyrics.com/{artist}-lyrics.html'
    artist_request = requests.get(path)
    if artist_request.status_code != 200:
        print("It seems this artist does not exist on www.metrolyrics.com.\n"
              "Try again with a valid artist name in https://www.metrolyrics.com/artistname-lyrics.html")
        return None
    soup_artist = soup(artist_request.text, 'html.parser')
    Title = []
    Link = []
    # Check the number of songs on the first page to know whether we need to check the next pages
    Nb_first_page = len(soup_artist.find_all(
        class_="songs-table compact")[0].find_all('a'))
    for link in soup_artist.find_all(class_="songs-table compact")[0].find_all('a'):
        Title.append(link.get('title'))
        Link.append(link.get('href'))
    # Fewer than 75 songs means the artist has only one page
    if Nb_first_page < 75:
        # Remove the artist name from each title
        pattern = rf"(?i){artist}\s(.+)\s+lyrics"
        Title = [re.findall(pattern, text)[0] for text in Title]
        df_artist = pd.DataFrame({'Title': Title, 'Link Lyrics': Link})
        df_artist['Name'] = f'{artist}'
        # df_artist.to_csv(f'{artist}_link.csv')
    # With 75 songs or more, we need to scrape the following pages as well
    else:
        for link in soup_artist.find_all(class_="pages")[0].find_all('a'):
            path_next_page = link.get('href')
            artist_request_nextpage = requests.get(path_next_page)
            soup_artist_next = soup(
                artist_request_nextpage.text, 'html.parser')
            for song_link in soup_artist_next.find_all(class_="songs-table compact")[0].find_all('a'):
                Title.append(song_link.get('title'))
                Link.append(song_link.get('href'))
        # Remove the artist name from each title
        pattern = rf"(?i){artist}\s(.+)\s+lyrics"
        Title = [re.findall(pattern, text)[0] for text in Title]
        # Create a pandas DataFrame to save to disk
        df_artist = pd.DataFrame({'Title': Title, 'Link Lyrics': Link})
        # Add a column with the artist's name
        df_artist['Name'] = f'{artist}'
        # df_artist.to_csv(f'{artist}_link.csv')
    return df_artist


def get_lyrics(url_lyrics):
    """Take the URL of a lyrics page and return the text of the lyrics
    as a string, or None if the page is missing or redirected."""
    lyrics_request = requests.get(url_lyrics)
    if lyrics_request.status_code != 200:
        print(f'It seems the link {url_lyrics} is not available; it will be removed from the database')
        lyrics = None
    elif lyrics_request.url != url_lyrics:
        print(f'It seems the link {url_lyrics} is being redirected; it will be removed from the database')
        lyrics = None
    # If the link exists, we can scrape the text
    else:
        artist_lyrics = soup(lyrics_request.text, 'html.parser')
        lyrics = []
        for each in artist_lyrics.find_all(id="lyrics-body-text")[0].find_all('p'):
            # Keep only the text of the lyrics from the HTML page
            lyrics.append(each.get_text())
        # Join the verses into one string and clean the newline characters
        lyrics = '. '.join(lyrics)
        lyrics = ' '.join(lyrics.split('\n'))
    return lyrics
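

# langdetect's detect is imported at the top but never called in this file.
# Below is a minimal sketch of how it could be used, assuming the intent was
# to tag each scraped lyric with its language (for example, to keep only
# English songs); the helper name detect_language is hypothetical and not
# part of the original module.
def detect_language(lyrics):
    """Return the ISO 639-1 language code of a lyric, or None when the
    text is empty or detection fails."""
    from langdetect.lang_detect_exception import LangDetectException
    if not lyrics:
        return None
    try:
        return detect(lyrics)
    except LangDetectException:
        return None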


def add_text_Lyrics_column(data_artist):
    """Add a 'Text Lyrics' column to the artist DataFrame by scraping the
    lyrics of each link in the 'Link Lyrics' column."""
    Nb_songs = data_artist.shape[0]
    Text_lyrics = []
    for i in range(Nb_songs):
        print(f'Song number {i}')
        Text_lyrics.append(get_lyrics(data_artist['Link Lyrics'].iloc[i]))
    data_artist['Text Lyrics'] = Text_lyrics
    return data_artist
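

# A minimal usage sketch, assuming www.metrolyrics.com is reachable and that
# 'adele' is a valid artist slug; the artist name is only an example.
if __name__ == "__main__":
    df = create_table_artist_link_lyrics('adele')
    if df is not None:
        df = add_text_Lyrics_column(df)
        # Drop songs whose lyrics page was missing or redirected
        df = df.dropna(subset=['Text Lyrics'])
        print(df.head())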