Source code for metalparser.libs.darklyrics_utils

import re
import string

from bs4 import BeautifulSoup
from metalparser.common.scraping import ScrapingAgent
from metalparser.common.exceptions import ArtistNotFoundException, LyricsNotFoundException, SongsNotFoundException

class DarkLyricsHelper:
    """
    A class with helpers for DarkLyricsApi

    Attributes
    ----------
    BASE_URL : str
        base URL, prepended to every relative path built by this class
    scraping_agent : ScrapingAgent
        The agent taking care of HTTP requests

    Methods
    -------
    get_base_url(self)
        Returns base URL.
    get_artist_page(self, artist)
        Returns a page related to an artist in form of a BeautifulSoup object.
    get_albums_info_from_artist_page(self, artist_page, title_only=False)
        Given the artist page, returns infos about the albums.
    get_albums_info_from_url(self, url)
        Returns album info given the album's URL.
    get_lyrics_url_by_song(self, song, artist)
        Given a song title and the artist, returns the link related to the lyrics.
    get_lyrics_url_by_tag(self, link_tag)
        Given an <a> HTML tag related to a song's lyrics, returns the related URL.
    get_lyrics_by_url(self, url)
        Given an URL related to a song, returns the lyrics.
    """

    # Multi-character names that need a hand-written transliteration.
    # Applied before any generic character handling.
    _SPECIAL_CASES = (
        ('+\\-', '2'),
        ('vhäldemar', 'vhaldemar'),
        ('øscillatör', 'scillatr'),
        ('zamieć', 'zamiec'),
    )
    # Nordic letters that are replaced by an ASCII letter instead of being dropped.
    _NORDIC_REPLACEMENTS = (
        ('ø', 'o'),
        ('ö', 'o'),
        ('ü', 'u'),
        ('å', 'a'),
        ('æ', 'e'),
    )
    _PUNCTUATION_RE = re.compile(r'[' + re.escape(string.punctuation) + ']')
    _SPECIAL_CHARS_RE = re.compile(r'[äæøáéíóúýćïëöüêčďěňřšťžėūãõ]')
    _WHITESPACE_RE = re.compile(r'[' + re.escape(string.whitespace) + ']')

    def __init__(self, use_cache):
        """
        Create the helper and its scraping agent.

        Arguments:
            use_cache {bool} -- Forwarded as-is to ScrapingAgent(use_cache=...)
        """
        # NOTE(review): BASE_URL is an empty string here, so every URL built by
        # this class is relative -- confirm the intended site root.
        self.BASE_URL = ''
        self.scraping_agent = ScrapingAgent(use_cache=use_cache)

    def get_base_url(self):
        """
        Returns base URL.

        Returns:
            [str] -- base URL
        """
        return self.BASE_URL

    def get_artist_page(self, artist):
        """
        Returns a page related to an artist in form of a BeautifulSoup object.

        Arguments:
            artist {str} -- The artist's name

        Raises:
            ArtistNotFoundException: Exception raised when the title of the
                fetched page contains 'not Found', or when no URL can be built
                from the artist's name

        Returns:
            [BeautifulSoup] -- Page related to an artist in form of a BeautifulSoup object
        """
        url = self.__get_artist_url(artist)
        artist_page = self.scraping_agent.get_page_from_url(url)

        if 'not Found' in artist_page.title.string:
            raise ArtistNotFoundException(
                'Artist page for "{}" not found at URL: {}. Is it on'.format(artist.title(), url)
            )
        return artist_page

    def get_albums_info_from_artist_page(self, artist_page, title_only=False):
        """
        Given the artist page, retrieve infos about the albums.

        Only <h2> headlines whose type prefix (the text before the first
        double quote) contains 'album', 'ep' or 'demo' are considered.

        Arguments:
            artist_page {BeautifulSoup} -- The artist page in BeautifulSoup format.

        Keyword Arguments:
            title_only {bool} -- When True return only the album titles, otherwise
                                 return a dict (title, type, release_year) per album
                                 (default: {False})

        Returns:
            [list] -- List of albums (str list or dict list, depending on title_only)
        """
        albums_list = []

        for headline in artist_page.find_all('h2'):
            headline_text = headline.text
            headline_parts = headline_text.split('"')
            type_prefix = headline_parts[0].lower()
            is_valid_album_type = any(kind in type_prefix for kind in ('album', 'ep', 'demo'))

            # A headline without any double quote carries no album title.
            if len(headline_parts) <= 1 or not is_valid_album_type:
                continue

            if title_only is False:
                album_type, title, release_year = self.__parse_album_headline(headline_text)
                albums_list.append({
                    'title': title,
                    'type': album_type,
                    'release_year': release_year
                })
            else:
                albums_list.append(headline_parts[1])

        return albums_list

    def get_albums_info_from_url(self, url):
        """
        Returns album info given the album's URL.

        Arguments:
            url {str} -- The album's URL

        Returns:
            [dict] -- A dict with the following album info: title, release year and type (album, EP).
                      For headlines containing 'non-album', title and release year are empty strings.
        """
        if '../lyrics' in url:
            url = url.replace('../', self.BASE_URL)

        album_page = self.scraping_agent.get_page_from_url(url)
        album_info_text = album_page.select_one('div.albumlyrics > h2').text

        if 'non-album' in album_info_text:
            return {
                'title': '',
                'release_year': '',
                'type': 'non-album songs'
            }

        album_type, title, release_year = self.__parse_album_headline(album_info_text)
        return {
            'title': title,
            'release_year': release_year,
            'type': album_type
        }

    def get_lyrics_url_by_song(self, song, artist):
        """
        Given a song title and the artist, returns the link related to the lyrics.

        Arguments:
            song {str} -- The title of the song
            artist {str} -- The artist's name

        Raises:
            LyricsNotFoundException: Exception raised when no link is found

        Returns:
            [str] -- The link related to the lyrics of the specified song
        """
        url = self.__get_search_url(song, artist)
        search_page = self.scraping_agent.get_page_from_url(url)

        for result in search_page.find_all('div', class_='sen'):
            anchor = result.find('a')
            if not anchor:
                continue

            # An <a> without href cannot point to lyrics; skip it instead of
            # failing on str + None.
            href = anchor.get('href')
            if not href:
                continue

            # Only links carrying a '#<n>' fragment address a single song.
            link = self.BASE_URL + href
            if '#' in link:
                return link

        raise LyricsNotFoundException('Lyrics for "{}" not found at URL: {}'.format(song, url))

    def get_lyrics_url_by_tag(self, link_tag):
        """
        Given an <a> HTML tag related to a song's lyrics, returns the related URL.

        Arguments:
            link_tag {BeautifulSoup} -- <a> tag which is supposed to contain an URL related to lyrics

        Raises:
            LyricsNotFoundException: Exception raised when no link or invalid link is found

        Returns:
            [str] -- URL string contained in the specified <a> tag, leading to lyrics.
        """
        # A missing href is reported like an invalid one, as documented above.
        href = link_tag.attrs.get('href') or ''
        if '/lyrics' in href:
            return href

        raise LyricsNotFoundException('Lyrics URL for the song "{}" not found.'.format(link_tag.text))

    def get_lyrics_by_url(self, url):
        """
        Given an URL related to a song, returns the lyrics.

        The URL must end with a '#<n>' fragment, where <n> selects the n-th
        '</h3>'-delimited section of the page's lyrics div.

        Arguments:
            url {str} -- URL leading to the lyrics of a certain song

        Raises:
            LyricsNotFoundException: Exception raised when the URL carries no song number,
                                     when no lyrics div is found, or when the page has
                                     no section for that song number

        Returns:
            [str] -- A string with the lyrics related to the specified URL
        """
        if '../lyrics' in url:
            url = url.replace('../', self.BASE_URL)

        url_parts = url.split('#')
        try:
            song_number = int(url_parts[1])
        except (IndexError, ValueError) as err:
            raise LyricsNotFoundException(
                'No song number found in lyrics URL: {}'.format(url)
            ) from err
        url = url_parts[0]

        lyrics_page = self.scraping_agent.get_page_from_url(url)
        lyrics_div = lyrics_page.find('div', class_='lyrics')

        if lyrics_div is None:
            raise LyricsNotFoundException('No lyrics found at URL: {}. Check if URL exists or try to clean the cache.'.format(url))

        # Every song title is wrapped in <h3>...</h3>, so the text following
        # the n-th closing tag starts with the lyrics of song n.
        sections = lyrics_div.prettify().split('</h3>')
        try:
            song_lyrics = sections[song_number]
        except IndexError as err:
            raise LyricsNotFoundException(
                'Song number {} not found at URL: {}'.format(song_number, url)
            ) from err

        return self.__sanitize_lyrics(song_lyrics)

    @staticmethod
    def __parse_album_headline(headline_text):
        """Split a headline such as 'album: "Title" (1999)' into (type, title, release year)."""
        parts = headline_text.split('"')
        album_type = parts[0].replace(':', '').strip()
        title = parts[1]
        release_year = parts[2].replace(')', '').replace('(', '').strip()
        return album_type, title, release_year

    def __sanitize_lyrics(self, lyrics):
        """Clean the lyrics string (expects the prettified HTML following a song's </h3>)."""
        # Remove tail (everything from the next song's title on).
        # NOTE(review): when no '<h3>' follows, find() returns -1 and this slice
        # drops the last character instead -- kept as-is, confirm it is harmless.
        sanitized_lyrics = lyrics[:lyrics.find('<h3>')]

        # Drop <br/> tags; the line breaks themselves are already in the text
        sanitized_lyrics = sanitized_lyrics.replace('<br/>', '')

        # Remove italic
        sanitized_lyrics = sanitized_lyrics.replace('</i>', '').replace('<i>', '')

        # Remove trailing divs
        sanitized_lyrics = sanitized_lyrics.split('<div')[0]

        # Remove duplicate blank lines: a blank line is kept only when it opens
        # a run of at least two blank lines right after a non-blank line.
        # The last line is never emitted.
        lines = [raw_line.rstrip() for raw_line in sanitized_lyrics.splitlines()]
        kept_lines = []
        for line_number in range(len(lines) - 1):
            line = lines[line_number]
            next_line = lines[line_number + 1]
            previous_line = lines[max(line_number - 1, 0)]
            if line != '' or (next_line == '' and previous_line != ''):
                kept_lines.append(line)
        sanitized_lyrics = ''.join('\n' + line for line in kept_lines)

        # Remove starting/ending newlines
        sanitized_lyrics = sanitized_lyrics[1:-1]

        # Remove space after newline
        sanitized_lyrics = sanitized_lyrics.replace('\n ', '\n')

        # Remove leading and trailing spaces
        return sanitized_lyrics.strip()

    def __get_search_url(self, song, artist):
        """Build an URL with a query usable by internal search engine."""
        query = self.__sanitize_search_query(artist + ' ' + song)
        return self.BASE_URL + 'search?q=' + query

    def __get_artist_url(self, artist):
        """Build an URL leading to the page of the specified artist."""
        sanitized_artist = self.__sanitize_artist_url(artist)

        # Nothing left after sanitizing (empty or punctuation-only name):
        # there is no first character to pick the index directory from.
        if not sanitized_artist:
            raise ArtistNotFoundException(
                'Cannot build an artist URL from the name "{}".'.format(artist)
            )

        # Artists starting with a digit are all grouped under the '19' index.
        if sanitized_artist[0].isdigit():
            index = '19'
        else:
            index = sanitized_artist[0]

        return self.BASE_URL + index + '/' + sanitized_artist + '.html'

    def __normalize(self, text, whitespace_replacement):
        """Lowercase and transliterate text; every whitespace char becomes whitespace_replacement."""
        text = text.lower()

        # Special cases
        for old, new in self._SPECIAL_CASES:
            text = text.replace(old, new)

        # Replace nordic chars with another letter
        for old, new in self._NORDIC_REPLACEMENTS:
            text = text.replace(old, new)

        # Remove punctuation signs
        text = self._PUNCTUATION_RE.sub('', text)

        # Remove other special chars
        text = self._SPECIAL_CHARS_RE.sub('', text)

        return self._WHITESPACE_RE.sub(whitespace_replacement, text)

    def __sanitize_artist_url(self, artist):
        """Clean a string and make it compatible to a artist URL"""
        # Whitespaces are removed
        return self.__normalize(artist, '')

    def __sanitize_search_query(self, query):
        """Clean a string and make it compatible to a search engine query"""
        # Whitespaces are replaced with '+'
        return self.__normalize(query, '+')