Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Serene-Arc authored Jul 11, 2024
2 parents 9122722 + 79449b0 commit 61e885c
Show file tree
Hide file tree
Showing 4 changed files with 1,195 additions and 30 deletions.
69 changes: 40 additions & 29 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,40 +448,51 @@ def _scrape_lyrics_from_html(self, html):
# Sometimes, though, it packages the lyrics into separate divs, most
# likely for easier ad placement

lyrics_div = soup.find("div", {"data-lyrics-container": True})

if lyrics_div:
lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
if not lyrics_divs:
self._log.debug("Received unusual song page html")
return self._try_extracting_lyrics_from_non_data_lyrics_container(
soup
)
lyrics = ""
for lyrics_div in lyrics_divs:
self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
return lyrics

if not lyrics_div:
self._log.debug("Received unusual song page html")
verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
if not verse_div:
if soup.find(
"div",
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental",
):
self._log.debug("Detected instrumental")
return "[Instrumental]"
else:
self._log.debug("Couldn't scrape page using known layouts")
return None
def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
    """Extract lyrics from a div without attribute data-lyrics-container.

    This is the second most common layout on genius.com.

    :param soup: parsed song page, searched for ``div`` elements by
        class-name pattern.
    :returns: the text of the lyrics container, ``"[Instrumental]"`` if
        the page carries the instrumental placeholder message, or
        ``None`` if no known layout is found.
    """
    verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
    if not verse_div:
        # No verse container at all: either the song is flagged as an
        # instrumental, or the page uses a layout we do not know.
        if soup.find(
            "div",
            class_=re.compile("LyricsPlaceholder__Message"),
            string="This song is an instrumental",
        ):
            self._log.debug("Detected instrumental")
            return "[Instrumental]"
        self._log.debug("Couldn't scrape page using known layouts")
        return None

    # Work on the parent of the first verse div so that sibling verse
    # divs are included in the extracted text.
    lyrics_div = verse_div.parent
    # NOTE(review): replace_br is defined elsewhere; presumably it turns
    # <br> tags into newlines -- confirm.
    self.replace_br(lyrics_div)

    # Ads sit between verses; keep a line break where each one was.
    ads = lyrics_div.find_all(
        "div", class_=re.compile("InreadAd__Container")
    )
    for ad in ads:
        ad.replace_with("\n")

    # Footers are dropped without leaving anything behind.
    footers = lyrics_div.find_all(
        "div", class_=re.compile("Lyrics__Footer")
    )
    for footer in footers:
        footer.replace_with("")
    return lyrics_div.get_text()


Expand Down
1 change: 1 addition & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Bug fixes:

* Improved naming of temporary files by separating the random part with the file extension.
* Fixed the ``auto`` value for the :ref:`reflink` config option.
* Fixed lyrics plugin only getting part of the lyrics from ``Genius.com`` :bug:`4815`

For packagers:

Expand Down
12 changes: 11 additions & 1 deletion test/plugins/test_lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,17 @@ def test_good_lyrics(self):
"""Ensure we are able to scrape a page with lyrics"""
url = "https://genius.com/Ttng-chinchilla-lyrics"
mock = MockFetchUrl()
self.assertIsNotNone(genius._scrape_lyrics_from_html(mock(url)))
lyrics = genius._scrape_lyrics_from_html(mock(url))
self.assertIsNotNone(lyrics)
self.assertEqual(lyrics.count("\n"), 28)

def test_good_lyrics_multiple_divs(self):
    """Ensure we scrape the full lyrics from a page that splits them
    across multiple divs
    """
    page_url = "https://genius.com/2pac-all-eyez-on-me-lyrics"
    fetch_page = MockFetchUrl()
    scraped = genius._scrape_lyrics_from_html(fetch_page(page_url))
    self.assertIsNotNone(scraped)
    # The newline count of the scraped text is pinned for this page.
    newline_count = scraped.count("\n")
    self.assertEqual(newline_count, 133)

# NOTE: a lyrics page with multiple divs is covered by test_good_lyrics_multiple_divs

Expand Down
1,143 changes: 1,143 additions & 0 deletions test/rsrc/lyrics/geniuscom/2pacalleyezonmelyrics.txt

Large diffs are not rendered by default.

0 comments on commit 61e885c

Please sign in to comment.