Analysis of Spotify Playlists

The Playlist Cohesometer

(aka... Playlist Eclectometer)

Christopher Baillie Olin


Background

I love listening to music. I also love making playlists. Typically, when I make a playlist, I try to make the playlist as cohesive as possible. In other words, I want all the songs to sound fairly similar, or have a similar feel or vibe. On the other hand, I strive to have an eclectic taste in music. In other words, I want to listen to a wide breadth of music, and to continue broadening my musical horizons.

So... cool... I want to have cohesive playlists, and eclectic listening overall (overall listening could probably be approximated by all my liked songs). But those are qualitative descriptions, and might be considered subjective.

Or are they?

What if there was a way to quantify how cohesive or eclectic a playlist is?

Enter: The Playlist Cohesometer

I listen to music on Spotify. Every year in December, Spotify gives users a report of their listening from that year (they call it \<year> Wrapped (eg, 2020 Wrapped)). One of the metrics they provide is the number of genres listened to that year, and a list of top genres. I wouldn't be surprised if most Spotify users couldn't name more than 30 genres, so naturally, seeing that they listened to hundreds of genres, or that their top genre is something they've never heard of (take "escape room" for example), might be confusing or intriguing to many people. I've been among those intrigued and confused for a few years running. This year I looked into what data was available through the Spotify API (which includes artists' genres), and was fascinated by what I found.

This will be our entry point for The Playlist Cohesometer.

This Project

My goal with this project is to find a way to score a Spotify playlist on a scale from eclectic to cohesive.

Along the way, however, I aim to guide the reader through the data science pipeline, and to leave the reader with a better understanding of something, whether that be the data science pipeline at large, a specific data science tool, musical genres, or the importance of multithreading. I'll be linking to various data sources, libraries, and miscellaneous knowledge sources throughout, so if you are interested in something I urge you to click some links and get a deeper understanding of a topic.

Without further ado, let's get started!


Gathering Data

As mentioned, we'll start with Spotify's Web API to get data about some playlists. The first thing we'll need to do here is get an access token so Spotify will let us get the data we want.

Spotify API Authorization

Spotify's API is not the most straightforward I've seen, and their authorization process is a big part of that. There are four ways to get authorized (they call them Authorization Flows). Since we only need to examine publicly available data, we're going to use the Client Credentials Authorization Flow. For more information about this Authorization Flow and the other three, take a look here.

We're going to need a way to make HTTP requests (requests), and a way to convert strings to base 64 (base64).

To get the all-important authorization credentials, you'll have to register an app. Instructions can be found here. Then you can get a client ID and a client secret. I've copied my client ID below, and I put the client secret in a text file called spot_api_secret.txt.

Here's what I consider the tricky part, just because it isn't explained very clearly in the documentation. The client ID must be joined with the client secret, with a colon as the delimiter, then it must be encoded as a base 64 string. Python's base 64 conversion likes to do its converting in bytes, so we'll have to do some extra converting.

Now we can make our request. We specify what Authorization Flow we're using in the body of the request, and we put the base 64 converted credentials in a header (with "Basic " preceding it). If all goes well, our access token should be in the response.
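A sketch of those two steps, the base 64 encoding and the token request (the client ID and secret here are placeholders; the endpoint and grant type follow Spotify's Client Credentials documentation):

```python
import base64

import requests


def encode_creds(client_id: str, client_secret: str) -> str:
    """Join the client ID and secret with a colon, then base64-encode.

    Python's base64 works on bytes, so we encode the string to bytes
    first and decode the result back to a string.
    """
    return base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()


def get_access_token(client_id: str, client_secret: str) -> str:
    """Request an access token via the Client Credentials Flow."""
    resp = requests.post(
        "https://accounts.spotify.com/api/token",
        data={"grant_type": "client_credentials"},  # the Flow goes in the body
        headers={"Authorization": "Basic " + encode_creds(client_id, client_secret)},
    )
    resp.raise_for_status()
    return resp.json()["access_token"]
```

With real credentials, `get_access_token(client_id, client_secret)` returns the token string to put in a Bearer header on later requests.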

With our access token, we can now explore all the publicly available data from the Spotify API.

Getting Playlist Data

We're interested in looking at the genres in a playlist. Internally, Spotify's hierarchy is as follows: playlists have songs, which have artists, which have genres. So in order to get a playlist's genres, we'll first have to get their tracks, then we'll have to get those tracks' artists, then we'll have to get those artists' genres.

Spotify playlists (as well as tracks and artists and albums) are identified by Spotify IDs (information here). These can be found by right-clicking a playlist in the Spotify desktop client (specifics here). I'll grab a number of my playlists, as well as a few playlists made by The Sounds of Spotify. In the following dictionary, these are the ones preceded by TSo (for "The Sound of..."). They are playlists made entirely of one genre of music, for the most part. The Sound of Everything is an exception to this, as it has one song from each of the 5437 genres Spotify acknowledges. This could probably be considered the most eclectic playlist possible.

In order to get information about these playlists, we'll make requests to the playlist tracks endpoint, so we'll transform this dictionary of names and IDs to a dictionary of names and API endpoints, using a dictionary comprehension (more info on dict comprehensions here and here).

Coding for data science is almost never something that will be written and run successfully the first time. For that reason, modularizing sections of code is a great way to avoid frustration. Jupyter Notebooks are one great way to do this. Defining functions is another great way. I like to do both. The first function we'll define will take a properly formatted url to the playlist tracks Spotify API endpoint, and return a list of the track data Spotify returns. This endpoint only returns 100 tracks at a time, so for playlists longer than 100 songs, multiple requests will have to be made, and their results concatenated together.
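A minimal sketch of that pagination loop. The page fetcher is injected as a function so the loop itself stays easy to test; in practice it would be something like `lambda u: requests.get(u, headers={"Authorization": "Bearer " + token}).json()`:

```python
def get_playlist_tracks(url, fetch):
    """Collect every track from a playlist, 100 at a time.

    `fetch` maps a url to Spotify's decoded JSON paging object; the
    paging object's "next" field holds the next page's url, or None
    when there are no more pages.
    """
    tracks = []
    while url is not None:
        page = fetch(url)
        tracks.extend(page["items"])  # up to 100 tracks per page
        url = page["next"]
    return tracks
```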

As a quick test, I'll run this function with one playlist and compare the length of the result with the number of songs in the playlist (1458).

At the very least, it has the right number of tracks! I'd also recommend looking through the results you get if you try something similar. The json for each track can be navigated like a dictionary in python. It is important to see what kinds of things are available to you when you're collecting data. I did a solid amount of poking around, but removed it because it was rather aimless and had a lot of long output.

Next, we'll define a function to get the artists from the songs in a playlist, and we'll keep track of how many times an artist occurs across all tracks in a playlist (using the Counter collection). Here, we're presented with our first major decision: how do we deal with songs with multiple artists? This is closely related to another decision we'll have to make soon: how do we deal with artists with multiple genres? We could just add all the genres for all the artists on a song, but consider a case where a song has two artists that are identified as Dutch R&B: would that song be twice as "Dutch R&B" as a song with only one such artist? No. Additionally, if a song is a collaboration between two artists from vastly different genres, is that song both of those genres? Yes. But it probably sounds like a mix between the two, not fully either one (this is based purely on my listening experience). For this reason, when we encounter a song with multiple artists, we will normalize the count by the number of artists. In this code, I will use the modifier "adjusted" (or "adj") instead of "normalized", because we will do some more blatant normalization later.
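Here's a sketch of that adjusted counting (the shape of the track JSON follows Spotify's playlist-items response, where each item's track carries a list of its artists):

```python
from collections import Counter


def count_artists_adj(tracks):
    """Adjusted artist counts: each track contributes 1 in total,
    split evenly among the artists on that track."""
    counts = Counter()
    for item in tracks:
        artists = item["track"]["artists"]
        for artist in artists:
            counts[artist["id"]] += 1 / len(artists)  # normalize by artist count
    return counts
```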

We'll test it out again:

I've printed out the Spotify ID and count of an artist in this playlist with a count that is not a whole number, as an example of the normalizing we did as we were counting.

Finally, we'll define a function to get all the genres in the playlist, using the function we made to get a playlist's artists. When we count genres, we will weight the count by how many times a particular artist with that genre appears in the playlist. We will not adjust by the number of genres an artist has (having 2 genres versus 1 does not make an artist less of each, based on my experience). We will, however, normalize the final counts by the number of tracks in the playlist. Ideally, our analysis should be irrespective of the size of the playlist, as a small playlist can be eclectic and a large playlist can be cohesive, and vice versa. In the code, I will identify this as "track normalized", or "tnormd". In the end, the value this function returns for a given genre should be the percent of songs in the playlist that may be identified as that genre. We'll also return our data in descending order of frequency, using Counter's most_common method.
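A sketch of that genre counting, under the assumption that we've already fetched a mapping from artist ID to that artist's genre list (`artist_genres` below stands in for that lookup):

```python
from collections import Counter


def count_genres_tnormd(artist_counts, artist_genres, num_tracks):
    """Track-normalized genre frequencies.

    Each genre is weighted by its artists' adjusted counts (no adjustment
    for multi-genre artists), then divided by the playlist length, so a
    value of 0.5 means half the playlist can be identified as that genre.
    """
    counts = Counter()
    for artist_id, count in artist_counts.items():
        for genre in artist_genres[artist_id]:
            counts[genre] += count / num_tracks
    return counts.most_common()  # descending by frequency
```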

Once again, we'll test out our function. I've made a playlist with only instrumental rock, so this should return a Counter (which is a subclass of dict) with one item, with the value being 100%.

Alright, we have some data! Let's see what we can glean from it!

Data Exploration

After getting some data, typically you'll want to familiarize yourself with it (even if you think you know about it already). Some great ways to do that are to find ways to visualize it, and to find statistics to summarize it.

Visualization

The meat of the data we have is just one metric, but we have it across a range of categories. This sounds like a job for a bar chart. We'll be using matplotlib to make some bar charts for our data. Specifically, we'll use the Axes.bar method. We'll write another function to do our plotting. On the x-axis, we'll put our genres, and on the y-axis, we'll put the relative genre frequency we found earlier. Since our data is already sorted by descending frequency, we can expect to see some sort of decreasing curve if we follow the tops of the bars. I believe this shape could be interesting and illuminating, and the heights (relative frequencies) could be interesting as well.
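A stripped-down version of such a plotting function might look like this (the figure size and label styling here are my guesses, not the original settings):

```python
import matplotlib

matplotlib.use("Agg")  # render off-screen; drop this line in a notebook
import matplotlib.pyplot as plt


def plot_genre_freqs(name, tnormd_counts):
    """Bar chart of a playlist's genre frequencies, already sorted descending."""
    genres, freqs = zip(*tnormd_counts)
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(range(len(genres)), freqs)
    ax.set_xticks(range(len(genres)))
    ax.set_xticklabels(genres, rotation=90, fontsize=6)
    ax.set_ylabel("relative genre frequency")
    ax.set_title(name)
    fig.tight_layout()
    return fig
```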

As I mentioned earlier, modularization can save a lot of frustration when coding for data science. This is an excellent example of that. When this function is run on all playlists in the playlists dict (this is the default), it takes about 40 seconds to run. When you are just changing how charts look, though, sometimes you want to make a quick change and see how it looks. I've left, commented out, an example of a call I used when adjusting how the graphs looked. Looking at only 3 playlists took the time down significantly.

These look pretty good! It seems for the most part that they follow a sort of exponential decay shape, which makes sense to me intuitively. We also see relative frequencies for top genres that are all over the board. The genre-specific playlists (The Sound of Chamber Psych, The Sound of Dream Pop) have top genre relative frequencies between 90% and 100% which makes sense. It's interesting that even the most eclectic playlist possible (The Sound of Everything) shares the same general shape as the rest of the playlists' curves, just scaled down significantly (top genre relative frequency seems to be about 0.85%). Unfortunately, most of these playlists have too many genres to be legibly labeled, but of the few that can be labeled, there are some interesting things to note. Looking at "technical," the top 3 genres are instrumental math rock, instrumental rock, and math rock, all with relative genre frequencies between 70% and 40%. Even someone unfamiliar with these genres could tell by the names that they're very similar.

These graphs are fun to look at, but let's stop trying to guess the values for things like the top genre's relative frequency. Next, we'll gather some statistics for these playlists.

Playlist Statistics

In order to keep track of our statistics, we'll use pandas dataframes (pandas docs, dataframes). We'll keep track of a number of things we think may be cool to look at. I'll define them below.

Once again we'll define a function to perform this task, to allow us to test our function quickly as we create it.
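A sketch of the per-playlist statistics, using the column names that appear in the discussion (the "demo" row is fabricated data for illustration):

```python
import pandas as pd


def stats_row(num_tracks, tnormd_genres):
    """Statistics for one playlist, from its track count and its
    descending (genre, relative frequency) list."""
    num_genres = len(tnormd_genres)
    freqs = [freq for _, freq in tnormd_genres]
    top = freqs[0]
    return {
        "num_tracks": num_tracks,
        "num_genres": num_genres,
        "genres:tracks": num_genres / num_tracks,
        "tracks:genres": num_tracks / num_genres,
        "avg_relfreq": sum(freqs) / num_genres,
        "top_gen_relfreq": top,
        "top:tot_gen": top / num_genres,
    }


# one row per playlist
stats_df = pd.DataFrame.from_dict(
    {"demo": stats_row(4, [("rock", 0.75), ("pop", 0.25)])}, orient="index"
)
```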

This is interesting to parse through. Let's look at how some of these things met or did not meet our expectations.

genres:tracks is a very interesting column. We expected this to be higher for more eclectic playlists. The Sound of Everything, which we noted earlier is probably the most eclectic playlist possible, has a value of 0.998896. The other two "The Sound of..." playlists (which might be considered some of the most cohesive) have values close to 0.5. Finds, trancee and deep synth, on the other hand, have values that are significantly above 1. This indicates that there's probably more to this statistic than simply "higher = more eclectic" and "lower = more cohesive". It could be a much more complex relationship between multiple factors, or it could just be irrelevant to what we're considering. Results like this are great in my opinion. Sure, it's fun to have your expectations met, but there's a lot to learn about our own assumptions when we find that our expectations were wrong. In data science, it's important to address these situations, rather than trying to cover them up. Nobody could be expected to guess everything correctly, otherwise there would be no reason to do data science.

As tracks:genres is the mathematical inverse of genres:tracks, it too could be considered more complex (or, alternatively, less important) than we anticipated.

avg_relfreq is interesting too. With the exception of the highly contrived test playlist, this tends to be very low for most playlists. It is very very low for The Sound of Everything (0.000583), but most are one to two orders of magnitude greater. The part I find interesting is in comparing The Sound of [genre] and playlists like trancee, jazze, technical, and deep synth. While these are playlists I'd like to call very cohesive, I probably wouldn't say they're more cohesive than playlists designed to showcase single genres. What this statistic seems to be more correlated with, in general, is the number of tracks. The more tracks there are, the lower the average relative frequency is. This might suggest we shouldn't focus too hard on it, or that we should at least be strongly aware of its correlation with number of tracks.

top:tot_gen falls into the same category as avg_relfreq here: it seems to be most strongly correlated with num_tracks.

top_gen_relfreq seems to align the most with our expectations. It may be interesting to focus mostly on the ends of the graphs where relative frequency is higher.

Let's confirm (or disprove) our speculations about correlations with pandas and seaborn, a library that works well for making heatmaps (docs, tutorial). We'll be looking for correlations between num_tracks and any other column, and it will also be interesting to note any other particularly strong correlations.
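The heatmap itself is a short bit of code; `stats_df` below is a tiny stand-in frame (the real one has all the columns discussed above):

```python
import matplotlib

matplotlib.use("Agg")  # off-screen rendering; drop this line in a notebook
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# stand-in for the real per-playlist stats frame
stats_df = pd.DataFrame({
    "num_tracks": [100, 1458, 5437],
    "num_genres": [40, 300, 5400],
    "top_gen_relfreq": [0.9, 0.3, 0.0085],
})

# pairwise Pearson correlations between all columns, drawn as a heatmap
fig, ax = plt.subplots()
sns.heatmap(stats_df.corr(), annot=True, fmt=".2f", cmap="coolwarm", center=0, ax=ax)
```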

num_tracks is most strongly correlated with num_genres (0.79), and top_gen_relfreq (-0.48), interestingly. Once again, my expectations have been broken by the latter of those strong correlations. We also see weaker (but still existent) correlations between num_tracks and genres:tracks and avg_relfreq. What this tells us is that perhaps these columns will still be meaningful to consider -- perhaps they are influenced by number of tracks, but also something else. That something else is what is particularly interesting to me.

I'm not surprised to see a very strong negative correlation between tracks:genres and genres:tracks, seeing as they are mathematically inversely related. Perhaps most interesting, outside of the num_tracks correlations, is the almost direct correlation between top:tot_gen and avg_relfreq. After some consideration, I believe this is because the sum of all the genre relative frequencies is dominated by the top genre. Once this connection is made, it seems obvious that [the sum of the relative frequencies divided by the number of genres] is almost directly correlated with [the top genre relative frequency divided by the number of genres]. This further leads me to believe it may be most rewarding to consider only the left side of our plots, where the relative frequencies are higher.

More Data Collection

The case for collecting more data

So far we've gotten some pretty interesting metrics, and we're on our way to being able to determine how cohesive or eclectic playlists are. However, there's still a big hole in our plan so far.

Consider two playlists. They each have high concentrations of three separate genres.

The first:

The second:

Clearly we would describe the first playlist as more cohesive and the second as more eclectic. Unfortunately, with only genre counts and distributions, and no information about the genres themselves, there's only so far this analysis will get. Obviously, there's a lot more to what makes two songs sound similar or different than their artists' genres. Two songs from different genres may sound remarkably similar (see: instrumental rock, instrumental math rock, math rock). Unfortunately, the exact characteristics that may indicate whether songs sound similar would be incredibly hard to parse through. There is some interesting data available from the Spotify API for track audio features (danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, and time_signature are available for all tracks), but surely someone has already done that...

everynoise.com to the rescue! Here's a description of Every Noise at Once from the website:

Every Noise at Once is an ongoing attempt at an algorithmically-generated, readability-adjusted scatter-plot of the musical genre-space, based on data tracked and analyzed for 5,437 genre-shaped distinctions by Spotify as of 2021-05-15. The calibration is fuzzy, but in general down is more organic, up is more mechanical and electric; left is denser and more atmospheric, right is spikier and bouncier.

(Retrieved 5/15/2021)

Every Noise at Once is made by Glenn McDonald, who is a "data alchemist" at Spotify. His work at Spotify includes creating people's Daily Mixes, and populating artists' Related Artists tabs (algorithmically, of course) [source]. If anyone could be called an expert at determining genres' relationships with each other, it's him.

The whole site is worth exploring if you're interested in music, but some parts are particularly useful for our purposes. Specifically, the genre list has a beautiful gem hidden in the html when sorted by "similarity to <genre>". Here are the first few rows of the list when sorted by similarity to trap:

[Image: the first few rows of the everynoise genre list, sorted by similarity to trap]

Note the title of the first table data cell in each row (the lines where class="note"). Acoustic distance between genres is exactly what we need to be able to tell if a playlist with a lot of instrumental rock, instrumental math rock, and math rock is different from a playlist with a lot of reggaeton, shoegaze, and hard techno! The former should have a low acoustic distance between the genres and the latter should have a much greater distance between the genres. The exact definition of what acoustic distance means is not entirely clear, but I am confident that there is a meaningful way Spotify/Glenn McDonald/everynoise.com calculates it. For reference, the distance between a genre and its closest genre is usually below 1, and the distance between a genre and its farthest genre is usually about 20 (the highest I've seen is around 34). Another important property is that these distances are bi-directional (the distance between russian synthpop and new rave is the same as the distance between new rave and russian synthpop). This will allow us to trim down how many times we have to look up a distance.

There is another metric listed with acoustic distance as well, called overlap. I will not use this because it quickly goes to zero for most genres (leaving thousands of genres tied with overlap of 0).

Scraping Acoustic Distance Between Genres

Before we get into scraping this data, it will be important to figure out how we're going to store it.

We could think of this as a weighted, undirected graph where every vertex is a genre, and the edge weights are the acoustic distances between two genres. This could be stored in an adjacency matrix as a numpy ndarray, or in a SQL database... but this is not a trivial case when it comes to storage: in the worst case (a playlist with all the genres, cough cough The Sound of Everything cough cough), we have distances between all 5400-ish genres to store. An adjacency matrix would have 5400 * 5400 = over 29 million entries, which might just fit in memory, but it would be cutting it close. Since this is not a trivial case, we'll just use a dedicated graph library. I've chosen NetworkX (docs).
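In NetworkX, the graph and its weighted edges look like this (the weights here are illustrative, not real everynoise values):

```python
import networkx as nx

# weighted, undirected graph: vertices are genres, edge weights are distances
G = nx.Graph()
G.add_edge("instrumental rock", "instrumental math rock", weight=0.7)  # illustrative
G.add_edge("instrumental rock", "hard techno", weight=18.2)            # illustrative

# because the graph is undirected, lookups work in either direction,
# matching the bi-directional property of the distances
dist = G["hard techno"]["instrumental rock"]["weight"]
```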

Now that we've decided how we'll take care of the data we get, we can look at how to get the data. We'll scrape the data using BeautifulSoup (docs).

Our first goal will be to get a sense of how we can access the pertinent information. For this, we'll just take a look at the table we get from our soup.

We can see that we'll need to go into each \<tr> and then into the third \<td> to get the genre names. I followed a similar process for the lists sorted by genre to find how to get the acoustic distance. I won't show the code or output for that, since it's basically the same as above, and the results are already shown in the screenshot above. The final code I used to extract the acoustic distances will come later.

We'll look at all the rows of the table, and snag the genre names from each row, and append them to a list.
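In BeautifulSoup terms, that looks roughly like this (the HTML fragment is a simplified stand-in for everynoise's table, shaped so the genre name sits in each row's third cell):

```python
from bs4 import BeautifulSoup

# simplified stand-in for the everynoise genre-list table
html = """
<table>
  <tr><td class="note"></td><td></td><td><a>pop</a></td></tr>
  <tr><td class="note"></td><td></td><td><a>trap</a></td></tr>
</table>
"""
soup = BeautifulSoup(html, "html.parser")

genre_names = []
for row in soup.find_all("tr"):
    cells = row.find_all("td")
    genre_names.append(cells[2].get_text())  # genre name is in the third <td>
```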

This list is quite long (5437 elements, to be exact), so I output only the first 10, and also printed the length, just to make sure we got everything. Now, we can begin getting the acoustic distances.

I learned an incredibly important lesson here, and I'd like to share it with you. We're going to need to make 5437 HTTP requests. This is no trivial task. It will take a long time. Things will go wrong.

Remember when I said at the beginning one of the things I want a reader to be able to take away from this project is the importance of threading? Well this is where this comes in. I tried collecting this data with a single process and a single thread at first, and wasted about 17 hours of my life. Over that time span, about 25% of the genres had data collected for them, and with every graph insertion it was taking longer. Things were not looking good.

When I returned to my computer after 17 hours to find that only a fraction of the genres had been processed, I realized I needed a better solution.

Threading was that better solution. Python has a library for threading built in (here's a useful guide to threading). Threading allows us to perform tasks concurrently, to make better use of time the processor spends idle. Basically, we are going to define a function we want all threads to work on, then set them all loose chipping away at the remaining genres to process.

To find and extract the acoustic distance from the "title" attributes, we'll use regular expressions (docs, tutorial). I recommend a tool like regex101 to test regular expressions if you're not totally comfortable with them.
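For example (the exact wording of the title text is my assumption; the pattern would be adjusted to whatever the page actually contains):

```python
import re

# pull the first decimal number that follows the word "distance"
dist_re = re.compile(r"distance:?\s*([\d.]+)")


def extract_distance(title):
    """Return the acoustic distance found in a title string, or None."""
    match = dist_re.search(title)
    return float(match.group(1)) if match else None
```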

The tricky part of threading is something called a data race (learn more about it and how to avoid it here). When a thread needs to write to shared data, it must be the only one accessing it. To ensure this, we use a Lock (essentially a binary semaphore, also known as a mutex). In our case, we're going to need a lock for the list of genres to process, and a lock for the graph where we're storing our distance relationships.
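Here's the skeleton of that worker pattern, with the scraping replaced by a stand-in computation (string length) so the locking is the focus:

```python
import threading

genres_to_process = ["pop", "trap", "shoegaze", "math rock"]  # stand-in work queue
list_lock = threading.Lock()    # guards the work list
graph_lock = threading.Lock()   # guards the shared results (the graph, in our case)
results = {}


def worker():
    while True:
        with list_lock:  # only one thread may take work at a time
            if not genres_to_process:
                return
            genre = genres_to_process.pop()
        value = len(genre)  # stand-in for "request the page, parse the distances"
        with graph_lock:
            results[genre] = value


threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

Note that the slow part (the HTTP request) happens outside both locks, so threads spend most of their time working in parallel.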

I also implemented a timer to predict when the job would be complete, which was done by the main thread. Information on the function I used for this can be found here.

Finally, I implemented a feature where the graph would be backed up to a file every 15 minutes, so in case something unexpected happened, I would still have the progress so far. I also save to this file at the end, so we only have to run this code block once (in the future we can just read the graph from the file). I did this with NetworkX's write_weighted_edgelist function. It is important to note that by default, this function delimits nodes using whitespace, which is an issue if your nodes have whitespace. I used ** as a delimiter because no genre has that in its name (verifiable with ctrl-f on everynoise).
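The round trip looks like this (toy graph and weight; the ** delimiter keeps multi-word genre names intact):

```python
import os
import tempfile

import networkx as nx

G = nx.Graph()
G.add_edge("math rock", "instrumental rock", weight=0.7)  # illustrative weight

path = os.path.join(tempfile.gettempdir(), "genre_distances.edgelist")
# the default delimiter is whitespace, which would split "math rock" in two
nx.write_weighted_edgelist(G, path, delimiter="**")
H = nx.read_weighted_edgelist(path, delimiter="**")
```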

I will not be showing the output of this function because running it takes over 4 hours (even with 16 threads!).

The two things I pointed out when discussing making 5437 HTTP requests were that it would take a long time and that things would go wrong. These are expected. I've covered the first; now I'll cover the second. There's one fatal flaw in the above code that I only discovered a couple hours after setting the threads loose: I'm just putting genre names right into a url! In retrospect, I should have known not to do this, but it's a great learning experience for everyone.

Consider the genre "r&b", and its many variations. '&' is a special character when it comes to urls (and so is '+'). Every time a thread requested a genre with a '&' or a '+' in it, everynoise just returned the main genre list page (with no distances). This was problematic for our purposes, but luckily it didn't break anything too badly (I expected things to go wrong, and I tried my best to prepare for them).

This had a rather easy fix. I just had to escape the problematic genre names, and recompile the distances between problematic genres. I used urllib's quote function to do this (docs). I just added the edges to the graph and saved it to file again (only one thread needed this time).
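For example (the url shape below is my reading of how everynoise's per-genre pages are addressed):

```python
from urllib.parse import quote

# '&' and '+' are special characters in urls, so escape the genre name first
genre = "r&b"
url = "https://everynoise.com/everynoise1d.cgi?root=" + quote(genre)
```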

Now that our graph is saved to the filesystem, we can re-extract it into a NetworkX graph with write_weighted_edgelist's counterpart, read_weighted_edgelist.

Okay, so we have our graph of all the genre distances. Now we need to turn this into something useful regarding our playlists. In order to account for the acoustic distance between genres, we will combine our genre frequency data with our genre distance data. If one genre is particularly far from the others, but that genre occurs only rarely, I would still consider the playlist mostly cohesive. In order to combine these numbers, we will multiply the acoustic distance between two genres by the average relative frequency of the two genres in the playlist. We will use the average relative frequency between two genres because this will weight distances involving more prevalent genres higher, and distances between uncommon genres lower. (Imagine a scenario where the top genre has low-ish distance to all other genres, but some less common genres are far from each other. They're still close-ish to the bulk of the playlist, and therefore the playlist would be considered more cohesive.)
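As a sketch, averaging those frequency-weighted distances over every pair of genres in a playlist (`distance` stands in for a lookup into the NetworkX graph):

```python
from itertools import combinations


def avg_adjusted_distance(tnormd_genres, distance):
    """Average pairwise acoustic distance, each pair weighted by the mean
    relative frequency of its two genres.

    `tnormd_genres` maps genre -> relative frequency; `distance(a, b)`
    returns the acoustic distance between two genres.
    """
    total, pairs = 0.0, 0
    for a, b in combinations(tnormd_genres, 2):
        total += distance(a, b) * (tnormd_genres[a] + tnormd_genres[b]) / 2
        pairs += 1
    return total / pairs if pairs else 0.0
```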

Now that we have a way to summarize how acoustic distance factors into a playlist's genres, we can incorporate this into our playlist-wise data. We'll create a new column for the average adjusted distance between all genres in the playlist. In this column, we can expect cohesive playlists to have smaller adjusted distances and eclectic playlists to have larger adjusted distances.

Once again, our expectations have been contradicted, though this time I believe it is due less to the underlying data and more to our method of adjusting the distances. It seems as though the relative frequency term of our adjustment overpowered the distance, and it also seems as though the playlist size plays a significant part in the adjusted distance. We will look at the correlations heatmap again to see what we can learn.

It seems that the strongest correlations with the adjusted distances are indeed the number of genres, the number of tracks, and also, interestingly, the ratio between them.

In an ideal world with infinite time and resources, I would like to explore other ways to incorporate genre acoustic distance into this analysis, ideally with less dependence on the size of the playlist and more dependence on the distance. Alas, this is not the case, and I must continue forward.

Hypothesis Testing and Machine Learning

Hypothesis testing generally assumes a known result, or an ability to verify your model somehow. Unfortunately, I have no way of knowing more than just qualitatively and subjectively how cohesive a playlist is. Therefore, we'll resort to unsupervised machine learning. This does not require data with anything known about it -- just data that may have some underlying quantity that is unknown (this is called a latent variable).

The method we will use is called Principal Component Analysis. We'll use the machine learning library scikit-learn; in particular, we'll use their implementation of PCA (docs). The goal of PCA is to find a linear combination of features that can reduce the dimensionality of the data while explaining as much variance in the data as possible. What this means is that we are looking for a smaller number of underlying metrics that can describe the data (hopefully) just as well as all the data.

As we said earlier, we do not want our analysis to be dependent on the number of songs or genres in a playlist. For this reason, we will not consider either when we train our model. We will use the rest of the statistics and computations we've made so far, though.
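A minimal sketch of fitting PCA on the remaining features (the matrix here is made-up data; standardizing first is my addition, since PCA is sensitive to feature scale):

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# made-up rows standing in for the per-playlist stats (sizes excluded)
X = np.array([
    [0.90, 0.50, 0.45],
    [0.30, 1.80, 0.02],
    [0.10, 2.50, 0.01],
    [0.95, 0.40, 0.50],
])

X_scaled = StandardScaler().fit_transform(X)  # zero mean, unit variance per column
pca = PCA(n_components=1)
component = pca.fit_transform(X_scaled)       # one value per playlist
print(pca.explained_variance_ratio_)          # share of variance the factor explains
```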

As printed before the dataframe, we were able to find a factor that explains 85% of the variance in the data. That's great, as long as it's not simply a manifestation of track or genre count. To discover this, we will examine the correlations in this data again.

As it turns out, the factor that can explain 85% of the variance in the data is directly proportional to the track to genre ratio! On top of this, it is only weakly correlated to the total number of tracks and genres.

Insights

What have we learned from our exploration?

We discovered that it is possible to mostly differentiate between playlists based on a factor that is unrelated to the size of the playlist. Furthermore, that factor is directly proportional to data that is (relatively) easy to obtain. Simply divide the number of tracks in a playlist by the number of genres. What we did not obtain was a way to score playlists based on their cohesiveness. Looking at this metric, it seems to have arbitrary values when looking at relatively cohesive playlists and comparing them, and similarly for relatively eclectic playlists.

I still believe that with more time and effort, a way to score a playlist's cohesiveness could be determined. If I were to continue with this project, I would: