Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix lrclib lyrics #5406

Open
wants to merge 6 commits into
base: lyrics-refactor-tests
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 150 additions & 27 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,15 @@
import struct
import unicodedata
import warnings
from functools import partial
from typing import TYPE_CHECKING, ClassVar
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode

import requests
from typing_extensions import TypedDict
from unidecode import unidecode

import beets
Expand Down Expand Up @@ -59,6 +63,7 @@
TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"

# The content for the base index.rst generated in ReST mode.
REST_INDEX_TEMPLATE = """Lyrics
Expand Down Expand Up @@ -96,6 +101,10 @@
"""


class NotFoundError(requests.exceptions.HTTPError):
pass


# Utilities.


Expand Down Expand Up @@ -266,37 +275,151 @@ def fetch(
raise NotImplementedError


class LRCLibItem(TypedDict):
"""Lyrics data item returned by the LRCLib API."""

id: int
name: str
trackName: str
artistName: str
albumName: str
duration: float | None
instrumental: bool
plainLyrics: str
syncedLyrics: str | None


@dataclass
@total_ordering
class LRCLyrics:
#: Percentage tolerance for max duration difference between lyrics and item.
DURATION_DIFF_TOLERANCE = 0.05

target_duration: float
duration: float
instrumental: bool
plain: str
synced: str | None

def __le__(self, other: LRCLyrics) -> bool:
"""Compare two lyrics items by their score."""
return self.dist < other.dist

@classmethod
def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
candidate["instrumental"],
candidate["plainLyrics"],
candidate["syncedLyrics"],
)

@cached_property
def duration_dist(self) -> float:
"""Return the absolute difference between lyrics and target duration."""
return abs(self.duration - self.target_duration)

@cached_property
def is_valid(self) -> bool:
"""Return whether the lyrics item is valid.
Lyrics duration must be within the tolerance defined by
:attr:`DURATION_DIFF_TOLERANCE`.
"""
return (
self.duration_dist
<= self.target_duration * self.DURATION_DIFF_TOLERANCE
)

@cached_property
def dist(self) -> tuple[float, bool]:
"""Distance/score of the given lyrics item.

Return a tuple with the following values:
1. Absolute difference between lyrics and target duration
2. Boolean telling whether synced lyrics are available.

Best lyrics match is the one that has the closest duration to
``target_duration`` and has synced lyrics available.
"""
return self.duration_dist, not self.synced

def get_text(self, want_synced: bool) -> str:
if self.instrumental:
return INSTRUMENTAL_LYRICS

if want_synced and self.synced:
return "\n".join(map(str.strip, self.synced.splitlines()))

return self.plain


class LRCLib(Backend):
base_url = "https://lrclib.net/api/get"
"""Fetch lyrics from the LRCLib API."""

def fetch(
BASE_URL = "https://lrclib.net/api"
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"

def warn(self, message: str, *args) -> None:
"""Log a warning message with the class name."""
self._log.warning(f"{self.__class__.__name__}: {message}", *args)

def fetch_json(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
kwargs.setdefault("timeout", 10)
kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
r = requests.get(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
r.raise_for_status()

return r.json()

def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> str | None:
params: dict[str, str | int] = {
"artist_name": artist,
"track_name": title,
}
) -> Iterator[list[LRCLibItem]]:
"""Yield lyrics candidates for the given song data.

Firstly, attempt to GET lyrics directly, and then search the API if
lyrics are not found or the duration does not match.

Return an iterator over lists of candidates.
"""
base_params = {"artist_name": artist, "track_name": title}
get_params = {**base_params, "duration": length}
if album:
params["album_name"] = album
get_params["album_name"] = album

if length:
params["duration"] = length
with suppress(NotFoundError):
yield [self.fetch_json(self.GET_URL, params=get_params)]

try:
response = requests.get(
self.base_url,
params=params,
timeout=10,
)
data = response.json()
except (requests.RequestException, json.decoder.JSONDecodeError) as exc:
self._log.debug("LRCLib API request failed: {0}", exc)
return None
yield self.fetch_json(self.SEARCH_URL, params=base_params)

if self.config["synced"]:
return data.get("syncedLyrics")
@classmethod
def pick_best_match(cls, lyrics: Iterable[LRCLyrics]) -> LRCLyrics | None:
"""Return best matching lyrics item from the given list."""
return min((li for li in lyrics if li.is_valid), default=None)

return data.get("plainLyrics")
def fetch(
self, artist: str, title: str, album: str, length: int
) -> str | None:
"""Fetch lyrics text for the given song data."""
fetch = partial(self.fetch_candidates, artist, title, album, length)
make = partial(LRCLyrics.make, target_duration=length)
pick = self.pick_best_match
try:
return next(
filter(None, map(pick, (map(make, x) for x in fetch())))
).get_text(self.config["synced"])
except StopIteration:
pass
except requests.JSONDecodeError:
self.warn("Could not decode response JSON data")
except requests.RequestException as exc:
self.warn("Request error: {}", exc)

return None


class DirectBackend(Backend):
Expand Down Expand Up @@ -478,7 +601,7 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
string="This song is an instrumental",
):
self._log.debug("Detected instrumental")
return "[Instrumental]"
return INSTRUMENTAL_LYRICS
else:
self._log.debug("Couldn't scrape page using known layouts")
return None
Expand Down Expand Up @@ -725,7 +848,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:


class LyricsPlugin(plugins.BeetsPlugin):
SOURCES = ["google", "musixmatch", "genius", "tekstowo", "lrclib"]
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
SOURCE_BACKENDS = {
"google": Google,
"musixmatch": MusiXmatch,
Expand Down
7 changes: 7 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ Bug fixes:
* :doc:`plugins/lyrics`: Do not attempt to search for lyrics if either the
artist or title is missing and ignore ``artist_sort`` value if it is empty.
:bug:`2635`
* :doc:`plugins/lyrics`: Fix fetching lyrics from ``lrclib`` source. If we
cannot find lyrics for a specific album, artist, title combination, the
plugin now tries to search for the artist and title and picks the most
relevant result. Update the default ``sources`` configuration to prioritize
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`

For packagers:

Expand Down
2 changes: 1 addition & 1 deletion docs/plugins/lyrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ configuration file. The available options are:
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
to all available sources.
Default: ``google genius tekstowo lrclib``, i.e., all the available sources. The
Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
``google`` source will be automatically deactivated if no ``google_API_key``
is setup.
The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
Expand Down
Loading