Source code for lyricsfandom.scrape

"""
Functions used to connect, extract, and display data from the Lyrics Fandom website.

These functions are used to scrape data from an ``HTML`` page connection. They are used inside the ``Artist``,
``Album``, and ``Song`` classes.

Most of these functions take a ``soup`` parameter, i.e. a ``Beautiful Soup`` ``Tag`` element
of a web page (usually the whole page, not just a ``<div>`` or other ``HTML`` element).
"""

import bs4

from .utils import *


def generate_artist_url(artist_name):
    """Generate a `Lyric Wiki` URL of an artist page from its name.

    Args:
        artist_name (string): name of the Artist.

    Returns:
        string

    Examples::

        >>> artist_name = 'london grammar'
        >>> generate_artist_url(artist_name)
        https://lyrics.fandom.com/wiki/London_Grammar

    """
    artist_id = name_to_wiki_id(artist_name)
    return f'https://lyrics.fandom.com/wiki/{artist_id}'

def generate_album_url(artist_name, album_name, album_year):
    """Generate a `Lyric Wiki` URL of an album page from its artist and name / year.

    Args:
        artist_name (string): name of the Artist.
        album_name (string): name of the Album.
        album_year (string or int): year of the Album.

    Returns:
        string

    Examples::

        >>> artist_name = 'london grammar'
        >>> album_name = 'if you wait'
        >>> album_year = 2013
        >>> generate_album_url(artist_name, album_name, album_year)
        https://lyrics.fandom.com/wiki/London_Grammar:If_You_Wait_(2013)

    """
    artist_id = name_to_wiki_id(artist_name)
    album_id = name_to_wiki_id(album_name)
    return f'https://lyrics.fandom.com/wiki/{artist_id}:{album_id}_({album_year})'

def scrape_albums(soup):
    """Scrape album tags, usually from the main artist wiki page.

    This function will successively yield albums.

    .. note::
        The function yields the ``.mw-headline`` tags nested inside ``<h2>`` headings;
        use ``.parent`` to retrieve the ``<h2>`` tag itself.

    Args:
        soup (bs4.element.Tag): artist page connection.

    Returns:
        yield bs4.element.Tag: album tags of an artist page.

    Examples::

        >>> # Import packages
        >>> import bs4  # for web scraping
        >>> import urllib.request  # to connect
        >>> # Set Up: connect to a lyric wiki page
        >>> USER = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        >>> HEADERS = {'User-Agent': USER}
        >>> URL = 'https://lyrics.fandom.com/wiki/London_Grammar'
        >>> req = urllib.request.Request(URL, headers=HEADERS)
        >>> page = urllib.request.urlopen(req)
        >>> soup = bs4.BeautifulSoup(page, 'lxml')
        >>> # Scrape albums
        >>> for album_tag in scrape_albums(soup):
        ...     print(album_tag.text)
        Strong (2013)
        If You Wait (2013)
        Truth Is a Beautiful Thing (2017)
        Songs on Compilations and Soundtracks
        Additional information
        External links

    """
    yield from soup.select('h2 .mw-headline')

def scrape_songs(album_h2_tag, li_tag='ol'):
    """Scrape songs from an album. This function should be used to scrape an artist's page.

    The optional parameter ``li_tag`` specifies whether to scrape released albums (``'ol'`` tags)
    or covers, singles, live versions etc. (``'ul'`` tags). They can be combined using
    ``li_tag=['ol', 'ul']`` to scrape among all songs.

    Args:
        album_h2_tag (bs4.element.Tag): album tag. Only songs under this tag will be yielded.
        li_tag (string or iterable): tag names to scrape songs from.

    Returns:
        yield bs4.element.Tag: yield song tags corresponding to the album tag.

    Examples::

        >>> # Import packages
        >>> import bs4  # for web scraping
        >>> import urllib.request  # to connect
        >>> # Set Up: connect to a lyric wiki page
        >>> USER = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        >>> HEADERS = {'User-Agent': USER}
        >>> URL = 'https://lyrics.fandom.com/wiki/London_Grammar'
        >>> req = urllib.request.Request(URL, headers=HEADERS)
        >>> page = urllib.request.urlopen(req)
        >>> soup = bs4.BeautifulSoup(page, 'lxml')
        >>> # Scrape songs from the first album, 'Strong (2013)' EP.
        >>> album_h2_tag = soup.select('h2 .mw-headline')[0].parent
        >>> for song_tag in scrape_songs(album_h2_tag):
        ...     print(song_tag.text)
        Strong
        Feelings
        >>> # Scrape all songs from the artist page
        >>> for album_tag in scrape_albums(soup):
        ...     album_h2_tag = album_tag.parent
        ...     for song_tag in scrape_songs(album_h2_tag):
        ...         print(album_h2_tag.text)
        ...         print(song_tag.text)
        ...         print('------------')
        Strong (2013)
        Strong
        Feelings
        ------------
        If You Wait (2013)
        Hey Now
        Stay Awake
        Shyer
        Wasting My Young Years
        Sights
        Strong
        etc. ...

    """
    soup = album_h2_tag.next_sibling
    while soup and soup.name != 'h2':
        if soup.name and soup.name in li_tag:
            for song_tag in soup.select('li'):
                yield song_tag.find('a')
        soup = soup.next_sibling

def scrape_external_links(soup):
    """Scrape the ``<div>`` tags listed under the ``External links`` section of a wiki page.

    Args:
        soup (bs4.element.Tag): connection to a wiki page.

    Returns:
        yield bs4.element.Tag: external link tags.
    """
    external_h2 = soup.select('#External_links')[0].parent
    external_tag = external_h2.next_sibling
    while external_tag and external_tag.name != 'h2':
        if external_tag.name == 'div':
            yield external_tag
        external_tag = external_tag.next_sibling

def get_lyrics(soup):
    """Get lyrics from a `Lyric Wiki` song page.

    Args:
        soup (bs4.element.Tag): connection to a wiki song page.

    Returns:
        string

    Examples::

        >>> # Import packages
        >>> import bs4  # for web scraping
        >>> import urllib.request  # to connect
        >>> # Set Up: connect to a lyric wiki page
        >>> USER = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        >>> HEADERS = {'User-Agent': USER}
        >>> URL = 'https://lyrics.fandom.com/wiki/London_Grammar:Shyer'
        >>> req = urllib.request.Request(URL, headers=HEADERS)
        >>> page = urllib.request.urlopen(req)
        >>> soup = bs4.BeautifulSoup(page, 'lxml')
        >>> # Scrape the lyrics
        >>> lyrics = get_lyrics(soup)
        >>> print(lyrics)
        I'm feeling shyer and the world gets darker
        Hold yourself a little higher
        Bridge that gap just further
        And all your being
        I'd ask you to give it up
        An ancient feeling love
        So beautifully dressed up
        Feeling shyer, I'm feeling shyer
        I'm feeling shyer
        Maybe you should call her
        Deep in the night for her
        And all your being
        I'd ask you to give it up
        I'd ask you to give it up

    """
    lyrics_container = soup.find("div", {'class': 'lyricbox'})
    lyrics = process_lyrics(str(lyrics_container))
    return lyrics

def get_artist_info(soup):
    """Get additional information about the artist / band.

    Args:
        soup (bs4.element.Tag): connection to a wiki artist page.

    Returns:
        dict
    """
    artist_info_data = {}
    key = 'other'
    artist_info_container = soup.findAll('div', attrs={'class': 'artist-info'})
    for artist_info_table in artist_info_container:
        artist_info_tables = artist_info_table.findAll('div', attrs={'class': 'css-table-cell'})
        for artist_info_cells in artist_info_tables:
            for artist_info_cell in artist_info_cells.children:
                if artist_info_cell.name == 'p':
                    # A <p> tag introduces a new field (e.g. "Genres:"); strip the trailing colon.
                    key = artist_info_cell.text.strip().title()
                    key = key[:-1] if key[-1] == ':' else key
                    artist_info_data[key] = None
                elif artist_info_cell.name == 'div':
                    # The following <div> holds the value(s) for the current field.
                    for artist_info_item in artist_info_cell.children:
                        if artist_info_item.name == 'p':
                            artist_info_data[key] = artist_info_item.text.strip()
                        if artist_info_item.name == 'ul':
                            # Lists of values: collect linked and bold items, removing duplicates.
                            artist_info_data[key] = []
                            artist_info_list = artist_info_item.findAll('a') + artist_info_item.findAll('b')
                            for artist_info_el in artist_info_list:
                                artist_info_data[key].append(artist_info_el.text.strip())
                            artist_info_data[key] = list(set(artist_info_data[key]))

    return artist_info_data
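
# The sketch below is not part of the original module: it is a minimal, hedged example showing how the
# helpers above can be combined end-to-end, following the set-up already used in the docstring examples
# (urllib.request + BeautifulSoup with the 'lxml' parser). The artist name, user agent, and page structure
# are taken from those examples and are assumptions; the live site layout may differ.
if __name__ == '__main__':
    import urllib.request

    USER = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    HEADERS = {'User-Agent': USER}

    # Build the artist URL and open the page, as in the docstring examples.
    url = generate_artist_url('london grammar')
    req = urllib.request.Request(url, headers=HEADERS)
    page = urllib.request.urlopen(req)
    soup = bs4.BeautifulSoup(page, 'lxml')

    # Walk the albums and their songs from the artist page.
    for album_tag in scrape_albums(soup):
        print(album_tag.text)
        for song_tag in scrape_songs(album_tag.parent):
            if song_tag is not None:  # list items without a link yield None
                print('   ', song_tag.text)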