"""
Utility functions.
"""
import string
import urllib
# Ordered (old, new) replacement pairs applied one after another to raw lyrics
# markup by ``process_lyrics``. Order matters: literal newlines are removed
# BEFORE ``<br/>`` tags are turned into newlines, so only the ``<br/>`` tags
# end up producing line breaks in the result.
LYRICS = [
    ('<i>', ''),      # drop opening italic tags
    ('</i>', ''),     # drop closing italic tags
    ('\n', ''),       # drop newlines already present in the markup
    ('<br/>', '\n'),  # turn HTML line breaks into newlines
]
def capitalize(string_raw):
    """Capitalize every whitespace-separated word of a string.

    The text is lower-cased, split on runs of whitespace, each word gets its
    first character upper-cased, and the words are joined back with single
    spaces. Leading / trailing whitespace is therefore dropped and inner runs
    of whitespace are collapsed. Quote characters inside a word (as in
    ``don't``) do not start a new capitalized segment.

    Args:
        string_raw (string): text to capitalize.

    Returns:
        string
    """
    words = string_raw.lower().split()
    return ' '.join(word.capitalize() for word in words)
def name_to_wiki(name):
    """Normalize an artist, album or song name.

    Surrounding whitespace is stripped and the result is passed through
    ``capitalize``.

    Args:
        name (string): raw name.

    Returns:
        string: the value returned by ``capitalize`` for the stripped name.
    """
    return capitalize(name.strip())
def name_to_wiki_id(name):
    """Generate a `Lyric Wiki` ID from a name.

    The name is normalized with ``name_to_wiki``, every space is replaced by
    an underscore, and the result is percent-encoded. The characters
    ``:/._-()%,`` are left unescaped; since ``%`` is among them, percent signs
    already present in the name are kept as-is.

    Args:
        name (string): name of an artist / song.

    Returns:
        string
    """
    # Imported here because a bare ``import urllib`` does not load the
    # ``urllib.parse`` submodule; relying on ``urllib.parse.quote`` then only
    # works if some other module happened to import it first.
    from urllib.parse import quote

    name_wiki = name_to_wiki(name)
    name_wiki_id = name_wiki.replace(' ', '_')
    return quote(name_wiki_id, safe=':/._-()%,')
def parse_song_title(song_title, artist_name=None):
    """Split a song title to retrieve the artist name and song name.

    The title is expected to look like ``<artist>:<song>``. The marker
    ``(page does not exist)`` and any ``//`` are removed first.

    When ``artist_name`` is given and occurs in the title, it is used to find
    the right separator, so that artist names containing ``:`` (ex: 'Ex:Re')
    are kept whole; in that case a ``/wiki/`` prefix is removed as well.
    Otherwise the title is split on its first ``:`` and everything after it is
    the song name, so song names containing ``:`` (ex: 'Re: Stacks') are not
    truncated.

    Args:
        song_title (string): song header (or title for the ``<a>`` element)
        artist_name (string, optional): name of the artist.

    Returns:
        tuple: ``(artist_name, song_name)``. ``song_name`` is an empty string
        when the title holds no song part.
    """
    song_title = song_title.replace('(page does not exist)', '').strip()
    song_title = song_title.replace('//', '')
    if artist_name and artist_name in song_title:
        # Knowing the artist name, count how many ':'-separated parts it spans
        # to locate the separator between the artist name and the song name.
        title_parts = song_title.replace('/wiki/', '').split(':')
        artist_name_nparts = artist_name.count(':') + 1
        artist_name_song = ':'.join(title_parts[:artist_name_nparts])
        song_name = ':'.join(title_parts[artist_name_nparts:])
    else:
        # partition() keeps the whole remainder as the song name and yields an
        # empty song name (instead of raising IndexError) when there is no ':'.
        artist_name_song, _, song_name = song_title.partition(':')
    return artist_name_song, song_name
def process_lyrics(lyrics):
    """Extract plain-text lyrics from lyric box markup.

    Args:
        lyrics (string): lyrics markup to clean; it is converted with
            ``str()`` first, so any object with a string form is accepted.

    Returns:
        string
    """
    text = str(lyrics)
    # Keep what follows the last lyric box opener, up to the first break marker.
    text = text.rpartition('<div class="lyricbox">')[2]
    text = text.partition('<div class="lyricsbreak">')[0]
    # Keep what follows the last ``<b>``, up to the first ``</b>``.
    text = text.rpartition('<b>')[2]
    text = text.partition('</b>')[0]
    text = text.replace('Instrumental', '')
    # Round-trip through UTF-8; characters that cannot be encoded are replaced.
    text = text.encode('utf-8', errors='replace').decode("utf-8")
    # Apply the module-level replacement pairs in their declared order.
    for old, new in LYRICS:
        text = text.replace(old, new)
    return text
import unidecode
def serialize_list(list_raw):
    """Serialize a list in ASCII format, so it can be saved as a JSON.

    Nested lists and dictionaries are serialized recursively. Every other
    value is converted with ``str()`` and transliterated with ``unidecode``,
    so non-string items come back as strings.

    Args:
        list_raw (list): values to serialize.

    Returns:
        list: a new list; ``list_raw`` is left untouched.
    """
    def _convert(item):
        # Containers recurse; everything else becomes a transliterated string.
        if isinstance(item, list):
            return serialize_list(item)
        if isinstance(item, dict):
            return serialize_dict(item)
        return unidecode.unidecode(str(item))

    return [_convert(item) for item in list_raw]
def serialize_dict(dict_raw):
    """Serialize a dictionary in ASCII format so it can be saved as a JSON.

    Keys are converted with ``str()`` and transliterated with ``unidecode``.
    Nested lists and dictionaries are serialized recursively; every other
    value is converted with ``str()`` and transliterated as well.

    Args:
        dict_raw (dict): mapping to serialize.

    Returns:
        dict: a new dictionary; ``dict_raw`` is left untouched.
    """
    result = {}
    for raw_key, raw_value in dict_raw.items():
        if isinstance(raw_value, list):
            converted = serialize_list(raw_value)
        elif isinstance(raw_value, dict):
            converted = serialize_dict(raw_value)
        else:
            converted = unidecode.unidecode(str(raw_value))
        # Keys that transliterate to the same text overwrite earlier entries.
        result[unidecode.unidecode(str(raw_key))] = converted
    return result