FINAL TUTORIAL -- BY EMMA LEBOUEF
# mounting my google drive
from google.colab import drive
drive.mount('/content/drive')
# changing the directory to my folder designated for this project
%cd "/content/drive/MyDrive/TU/SEMESTERS/f2023/data science"
Mounted at /content/drive
# importing important packages I will need
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import random
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names, but PCA was fitted with feature names")
DATA SCIENCE MOTIVATION:
- Build a Spotify Recommendation System for playlist expansion utilizing data science tools such as semi-supervised machine learning, logistic classification, and probabilistic outcome limiting.
- Recommendation engines are rapidly expanding in many B2B and B2C companies. As a future data scientist, I wanted to expand my familiarity with building recommendation systems, since much of my current portfolio is centered around supervised machine learning projects.
- An introductory article to the benefits of semi-supervised machine learning can be found here
- A brief summary of recommendation engines can be found here
- A project outline of how to build a Kmeans clustering algorithm with limited labeled outcomes can be found here
- Class notes from Tulane's CMPS 6240 that I utilized to shape my code can be found here
- Other Playlist / Music Recommendation Tutorials can be found here, and here, and also here!
GOALS & SPECIFIED OUTCOMES:
- Build a KMeans clustering model that groups unlabeled songs into K distinct clusters and assigns each cluster a main-genre approximation based on the labeled song nearest its center (a toy sketch of this propagation step follows this list)
- Repeat the process so that each song has one main genre and three subgenres
- Train a logistic regression on the labeled songs to output a playlist name
- Predict playlist names for a subset of the unlabeled songs and utilize predict_proba to assign a song to a playlist if the probability it belongs to that playlist exceeds a specified threshold
- Display how the top five playlists expanded under this recommendation system
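The core semi-supervised trick in the first goal (propagate the label of the labeled song closest to each cluster center onto every song in that cluster) is easier to see on a toy example first. The sketch below uses made-up data and only the packages imported above; it is not part of the project code, and the real pipeline later in this notebook follows the same steps on the Spotify audio features.
# toy feature matrix: 10 "songs", 2 features, only the first 4 have a known genre
rng = np.random.default_rng(0)
features = pd.DataFrame(rng.normal(size=(10, 2)), columns=["tempo", "energy"])
labels = pd.Series(["rock", "rock", "pop", "pop"] + [None] * 6)
# cluster everything, labeled or not; fit_transform returns each row's distance to every center
kmeans_toy = KMeans(n_clusters=2, n_init=10, random_state=0)
distances = kmeans_toy.fit_transform(features)
# for each cluster, find the labeled song closest to that cluster's center...
has_label = labels.notna()
closest_labeled = np.argmin(distances[has_label.values], axis=0)
center_labels = labels[has_label].iloc[closest_labeled].to_numpy()
# ...and propagate that label to every song assigned to the cluster
propagated = pd.Series(center_labels[kmeans_toy.labels_])
print(propagated)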
For this project, all my data came from publicly available Kaggle files.
The dataset below contains information on over 1.2 million unique tracks from Spotify. For every track a series of features is recorded, ranging from tempo to acousticness. I will utilize this dataset to expand the options available in the playlist generator later in this project.
track_features = pd.read_csv('//content/drive/MyDrive/TU/SEMESTERS/f2023/data science/tracks_features.csv')
# the id columns are alphanumeric and not needed for this analysis
track_features = track_features.drop(columns=['id', 'album_id', 'artist_ids'])
# the artists column is a list of artists; keeping the primary artist only
track_features[['primary_artist']] = track_features['artists'].str.split("'", expand=True)[[1]]
# track number and disc number are irrelevant to the analyses we will be performing
track_features = track_features.drop(columns=['artists', 'track_number', 'disc_number'])
track_features.head()
name | album | explicit | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | year | release_date | primary_artist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Testify | The Battle Of Los Angeles | False | 0.470 | 0.978 | 7 | -5.399 | 1 | 0.0727 | 0.02610 | 0.000011 | 0.3560 | 0.503 | 117.906 | 210133 | 4.0 | 1999 | 1999-11-02 | Rage Against The Machine |
1 | Guerrilla Radio | The Battle Of Los Angeles | True | 0.599 | 0.957 | 11 | -5.764 | 1 | 0.1880 | 0.01290 | 0.000071 | 0.1550 | 0.489 | 103.680 | 206200 | 4.0 | 1999 | 1999-11-02 | Rage Against The Machine |
2 | Calm Like a Bomb | The Battle Of Los Angeles | False | 0.315 | 0.970 | 7 | -5.424 | 1 | 0.4830 | 0.02340 | 0.000002 | 0.1220 | 0.370 | 149.749 | 298893 | 4.0 | 1999 | 1999-11-02 | Rage Against The Machine |
3 | Mic Check | The Battle Of Los Angeles | True | 0.440 | 0.967 | 11 | -5.830 | 0 | 0.2370 | 0.16300 | 0.000004 | 0.1210 | 0.574 | 96.752 | 213640 | 4.0 | 1999 | 1999-11-02 | Rage Against The Machine |
4 | Sleep Now In the Fire | The Battle Of Los Angeles | False | 0.426 | 0.929 | 2 | -6.729 | 1 | 0.0701 | 0.00162 | 0.105000 | 0.0789 | 0.539 | 127.059 | 205600 | 4.0 | 1999 | 1999-11-02 | Rage Against The Machine |
All dtypes are as expected: artist, album, and name are strings, and the remaining variables are integers, booleans, or floats. I manually converted release_date to datetime.
track_features['release_date'] = pd.to_datetime(track_features['release_date'], format='%Y-%m-%d', errors='coerce')
track_features.dtypes
name object album object explicit bool danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 duration_ms int64 time_signature float64 year int64 release_date datetime64[ns] primary_artist object dtype: object
The dataset below is very similar to the previous one: it contains quantitative information on unique Spotify tracks. Unlike the other dataset, however, it also includes the genres, subgenres, and playlists Spotify has grouped these songs under. Over 20k tracks are represented. I will utilize this dataset when building the labeling model later in this project.
# IMPORTING EACH OF THE DATASETS FROM MY GOOGLE DRIVE AND ADDING A CATEGORY BASED ON THE DATAFRAME THEY ARE IN
hip_hop = pd.read_csv('//content/drive/MyDrive/TU/SEMESTERS/f2023/data science/hiphop_music_data.csv')
hip_hop['main_cat'] = 'hip hop'
indie = pd.read_csv('//content/drive/MyDrive/TU/SEMESTERS/f2023/data science/indie_alt_music_data.csv')
indie['main_cat'] = 'indie'
metal = pd.read_csv('//content/drive/MyDrive/TU/SEMESTERS/f2023/data science/metal_music_data.csv')
metal['main_cat'] = 'metal'
pop = pd.read_csv('//content/drive/MyDrive/TU/SEMESTERS/f2023/data science/pop_music_data.csv')
pop['main_cat'] = 'pop'
rock = pd.read_csv('//content/drive/MyDrive/TU/SEMESTERS/f2023/data science/rock_music_data.csv')
rock['main_cat'] = 'rock'
## CONCAT EACH FRAME INTO ONE LARGE FRAME
multi_genre = pd.concat([hip_hop, indie, metal, pop, rock], ignore_index= True)
# EACH SONG HAS A LIST OF GENRES DESIGNATED BY SPOTIFY, I HAVE SELECTED THREE OF THESE SUBGENRES TO INCLUDE IN MY DATA
multi_genre[['subgenre1', 'subgenre2', 'subgenre3']] = multi_genre['Genres'].str.split("'", expand=True)[[1, 3, 5]]
# DROPPING UNNECESSARY COLUMNS
multi_genre = multi_genre.drop(columns = {
'id', 'uri', 'track_href', 'analysis_url',
'time_signature', 'Genres'})
multi_genre.head()
Artist Name | Track Name | Popularity | Playlist | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | main_cat | subgenre1 | subgenre2 | subgenre3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 Savage | Spiral | 63 | No Cap | 0.814 | 0.659 | 2 | -4.475 | 1 | 0.0829 | 0.00155 | 0.025100 | 0.568 | 0.130 | 88.506 | 171527 | hip hop | atl hip hop | rap | trap |
1 | VIC MENSA | SHELTER ft Wyclef Jean, ft Chance The Rapper | 55 | Mellow Bars | 0.664 | 0.660 | 0 | -5.284 | 1 | 0.2910 | 0.18100 | 0.000002 | 0.190 | 0.470 | 90.106 | 261467 | hip hop | chicago rap | conscious hip hop | hip hop |
2 | Pooh Shiesty | Welcome To The Riches (feat. Lil Baby) | 0 | No Cap | 0.842 | 0.400 | 11 | -11.308 | 0 | 0.4860 | 0.04570 | 0.000000 | 0.172 | 0.205 | 130.018 | 192052 | hip hop | memphis hip hop | rap | southern hip hop |
3 | Athletic Progression | Stepney Tale | 37 | Jazz Rap | 0.632 | 0.800 | 5 | -7.227 | 0 | 0.2340 | 0.54700 | 0.000000 | 0.147 | 0.496 | 92.757 | 209833 | hip hop | aarhus indie | danish modern jazz | jazz rap |
4 | Ghetts | Fire and Brimstone | 46 | Grime Shutdown | 0.846 | 0.511 | 1 | -8.116 | 1 | 0.2630 | 0.03750 | 0.000005 | 0.147 | 0.346 | 136.964 | 190773 | hip hop | grime | uk alternative hip hop | uk hip hop |
The dtypes for the multi-genre dataframe, displayed below, are all as expected.
multi_genre.dtypes
Artist Name object Track Name object Popularity int64 Playlist object danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 duration_ms int64 main_cat object subgenre1 object subgenre2 object subgenre3 object dtype: object
My data processing was limited to concatenating the two dataframes, standardizing column names, and removing columns that were not shared between the two datasets.
# CHANGING THE NAMES OF ARTIST AND TRACK IN MULTIGENRE FOR EASE OF MERGING
multi_genre= multi_genre.rename(columns = {"Artist Name" : "primary_artist", "Track Name" : "name"})
combo = pd.concat([multi_genre, track_features], ignore_index= True)
# DROPPING COLUMNS NOT SHARED BETWEEN THE TWO FRAMES
combo = combo.drop(columns = {'album','explicit',
'time_signature', 'year', 'release_date', 'Popularity'})
# THERE IS BOUND TO BE OVERLAP BETWEEN THE TWO DATAFRAMES, DROPPING DUPLICATES BASED ON ARTIST NAME, SONG NAME PAIRINGS
combo = combo.drop_duplicates(subset = ['primary_artist', 'name'], keep = 'first')
combo.head()
primary_artist | name | Playlist | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | main_cat | subgenre1 | subgenre2 | subgenre3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 Savage | Spiral | No Cap | 0.814 | 0.659 | 2 | -4.475 | 1 | 0.0829 | 0.00155 | 0.025100 | 0.568 | 0.130 | 88.506 | 171527 | hip hop | atl hip hop | rap | trap |
1 | VIC MENSA | SHELTER ft Wyclef Jean, ft Chance The Rapper | Mellow Bars | 0.664 | 0.660 | 0 | -5.284 | 1 | 0.2910 | 0.18100 | 0.000002 | 0.190 | 0.470 | 90.106 | 261467 | hip hop | chicago rap | conscious hip hop | hip hop |
2 | Pooh Shiesty | Welcome To The Riches (feat. Lil Baby) | No Cap | 0.842 | 0.400 | 11 | -11.308 | 0 | 0.4860 | 0.04570 | 0.000000 | 0.172 | 0.205 | 130.018 | 192052 | hip hop | memphis hip hop | rap | southern hip hop |
3 | Athletic Progression | Stepney Tale | Jazz Rap | 0.632 | 0.800 | 5 | -7.227 | 0 | 0.2340 | 0.54700 | 0.000000 | 0.147 | 0.496 | 92.757 | 209833 | hip hop | aarhus indie | danish modern jazz | jazz rap |
4 | Ghetts | Fire and Brimstone | Grime Shutdown | 0.846 | 0.511 | 1 | -8.116 | 1 | 0.2630 | 0.03750 | 0.000005 | 0.147 | 0.346 | 136.964 | 190773 | hip hop | grime | uk alternative hip hop | uk hip hop |
print(f"There are {combo.main_cat.value_counts().sum()} labeled rows remaining in our dataframe and a total of {len(combo)} unique songs to build our model with")
There are 16503 labeled rows remaining in our dataframe and a total of 1140107 unique songs to build our model with
# HOW MUCH INFORMATION DO WE HAVE FROM EACH COLUMN
combo.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1140107 entries, 0 to 1226566 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 primary_artist 1140107 non-null object 1 name 1140107 non-null object 2 Playlist 16503 non-null object 3 danceability 1140107 non-null float64 4 energy 1140107 non-null float64 5 key 1140107 non-null int64 6 loudness 1140107 non-null float64 7 mode 1140107 non-null int64 8 speechiness 1140107 non-null float64 9 acousticness 1140107 non-null float64 10 instrumentalness 1140107 non-null float64 11 liveness 1140107 non-null float64 12 valence 1140107 non-null float64 13 tempo 1140107 non-null float64 14 duration_ms 1140107 non-null int64 15 main_cat 16503 non-null object 16 subgenre1 14759 non-null object 17 subgenre2 12055 non-null object 18 subgenre3 9489 non-null object dtypes: float64(9), int64(3), object(7) memory usage: 174.0+ MB
# Only showing the graphs for songs with labels
subset = combo[combo['main_cat'].notna()].copy()
subsetx = subset[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
subsety = subset[['main_cat']]
# SUMMARY STATS
subsetx.describe()
danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 | 16503.000000 |
mean | 0.549131 | 0.703038 | 5.294007 | -7.217833 | 0.609101 | 0.089668 | 0.183417 | 0.119798 | 0.198907 | 0.475726 | 123.590411 |
std | 0.171404 | 0.210986 | 3.572691 | 3.293898 | 0.487967 | 0.093402 | 0.261125 | 0.257436 | 0.157356 | 0.235877 | 29.367800 |
min | 0.000000 | 0.000020 | 0.000000 | -34.825000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.011900 | 0.000000 | 0.000000 |
25% | 0.432000 | 0.562000 | 2.000000 | -8.788500 | 0.000000 | 0.035800 | 0.001950 | 0.000000 | 0.097300 | 0.291000 | 99.996000 |
50% | 0.547000 | 0.734000 | 5.000000 | -6.604000 | 1.000000 | 0.051500 | 0.044200 | 0.000227 | 0.133000 | 0.466000 | 121.946000 |
75% | 0.671000 | 0.883000 | 9.000000 | -4.953000 | 1.000000 | 0.097550 | 0.277000 | 0.040100 | 0.267000 | 0.656000 | 142.860500 |
max | 0.989000 | 1.000000 | 11.000000 | 1.355000 | 1.000000 | 0.960000 | 0.996000 | 0.996000 | 0.992000 | 0.986000 | 249.438000 |
# AVERAGE FEATURES BY MAIN CATEGORY
subset.groupby(['main_cat'])[subsetx.columns].mean()
danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | |
---|---|---|---|---|---|---|---|---|---|---|---|
main_cat | |||||||||||
hip hop | 0.725896 | 0.643251 | 5.398239 | -7.492763 | 0.531800 | 0.233713 | 0.197276 | 0.028317 | 0.200384 | 0.545804 | 118.718808 |
indie | 0.554327 | 0.656076 | 5.378719 | -8.197993 | 0.655320 | 0.067774 | 0.224543 | 0.229955 | 0.186843 | 0.488513 | 123.396614 |
metal | 0.411755 | 0.876887 | 5.257905 | -5.879572 | 0.584000 | 0.088527 | 0.028511 | 0.213043 | 0.224401 | 0.322053 | 128.244486 |
pop | 0.636119 | 0.603041 | 5.193043 | -7.039221 | 0.526376 | 0.072547 | 0.328205 | 0.021150 | 0.167847 | 0.521876 | 119.776471 |
rock | 0.503999 | 0.721748 | 5.262985 | -7.174320 | 0.657919 | 0.059603 | 0.155368 | 0.081397 | 0.209171 | 0.492442 | 125.128764 |
## scaling the data
scaler = StandardScaler()
subsetx = scaler.fit_transform(subsetx)
# COMPRESSING OUR DATA INTO TWO DIMENSIONS
pca = PCA(n_components=2)
X_pca = pca.fit_transform(subsetx)
# Graph One: Similarity of songs across multiple vectors
# creating a dataframe and merging the main genre type
X_pca = pd.DataFrame(X_pca, columns=["Component One", "Component Two"])
X_pca = pd.merge(subsety[['main_cat']], X_pca, left_index = True, right_index = True)
# graphing the similarity features and coloring by main genre type
sns.scatterplot(x="Component One", y="Component Two", hue="main_cat", data=X_pca)
plt.title("Song Similarity by Genre Type")
plt.legend(loc='best', bbox_to_anchor=(1, 0.5))
<matplotlib.legend.Legend at 0x7c26176b01f0>
It is difficult to linearly or clearly separate the data based on genre type. Let's see if graphing the similarity regions around each genre makes these distinctions easier.
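Before moving on, it is worth checking how much of the 11-feature variance the two plotted components actually retain; this quick check is my own addition and simply reads the pca object fitted above.
# my own sanity check (not part of the original workflow): variance retained by the two components
print(pca.explained_variance_ratio_)
print(f"total variance retained by two components: {pca.explained_variance_ratio_.sum():.1%}")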
Utilizing Kernel Density Plots to display the similarity regions around each main category
# Graph Two - Showing Neighborhoods Shape by Genre Type
## utilizing facetgrid to plot all five categories from main_cat
neighborhoods = sns.FacetGrid(X_pca, col="main_cat", sharex=True, sharey=True)
## graphing the kernel density to see where each genre roughly lies
neighborhoods.map(sns.kdeplot, "Component One", "Component Two", color="#d5f979ff", fill=True)
# setting the title of each graph to the name of the category
neighborhoods.set_titles(col_template="{col_name}")
<seaborn.axisgrid.FacetGrid at 0x7c2614ce6ce0>
While some regions are shifted or more compressed than others, for the most part there is significant overlap in song features from genre to genre. Let's see if we can get more variability when we graph by playlist.
There are over 1300 playlists represented in the dataset; we will limit the playlists shown graphically to the top 8.
## Graph Three:
## is there similarity within a playlist and differentiability across playlists?
## limiting to songs that are part of a playlist represented more than 95 times in the dataframe
value_counts = subset['Playlist'].value_counts()
subset = value_counts[value_counts > 95].index
subset = combo[combo['Playlist'].isin(subset)].reset_index()
## pulling the x variables based on numerical song features
subsetx = subset[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
# scaling the data
scaler = StandardScaler()
subsetx = scaler.fit_transform(subsetx)
## y is the playlist the songs belong to
subsety = subset[['Playlist']]
## compressing the data into two columns
pca = PCA(n_components=2)
X_pca = pca.fit_transform(subsetx)
# creating a dataframe and merging the playlist name
X_pca = pd.DataFrame(X_pca, columns=["Component One", "Component Two"])
X_pca = pd.merge(subsety[['Playlist']], X_pca, left_index = True, right_index = True)
## graphing the kernel density to see where each playlist roughly lies
neighborhoods = sns.FacetGrid(X_pca, col="Playlist", sharex=True, sharey=True)
neighborhoods.map(sns.kdeplot, "Component One", "Component Two", color="#d5f979ff", fill=True,warn_singular=False)
neighborhoods.set_titles(col_template="{col_name}")
<seaborn.axisgrid.FacetGrid at 0x7c2621e8fdc0>
Since we have seen the most variability across songs from different playlists, I have chosen to make playlist my final output variable. I will build a model that assigns songs to unique playlists.
# OUR X VARIABLES WILL BE THE 11 QUANTITATIVE VARIABLES SHARED BETWEEN THE TWO DATASETS
X = combo[['danceability','energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness','instrumentalness', 'liveness', 'valence', 'tempo']]
# OUR FIRST CLUSTERING MODEL WILL BE BASED ON THE MAIN CATEGORY ASSIGNED TO SONGS
Y = combo[['main_cat']]
# INITIALIZING A KMEANS CLUSTERING MODEL THAT CLUSTERS ALL THE SONGS BASED ON QUANTITATIVE SIMILARITY
# I CHOSE 15 CLUSTERS TO ENSURE THAT ALL FIVE MAIN CATEGORIES WERE REPRESENTED
kmeans = KMeans(n_clusters=15, n_init=10)
X_dist = kmeans.fit_transform(X)
# DISTANCE OF EACH TRACK FROM THE 15 CENTERS
X_dist
array([[31.15891194, 21.89261763, 85.64627615, ..., 19.03262616, 41.36822771, 53.49055382], [29.76190728, 22.21242228, 84.0957667 , ..., 18.05660452, 39.91911081, 52.0010951 ], [12.59721845, 51.99022558, 44.25780406, ..., 34.50300297, 6.46374311, 13.41742872], ..., [ 5.44831397, 45.98845177, 52.22592018, ..., 29.05817349, 8.8452366 , 20.29485137], [ 6.14958748, 48.27482269, 49.04708616, ..., 31.00539346, 5.0120271 , 16.89190819], [ 3.61649487, 40.96733216, 55.99843587, ..., 23.85519134, 12.0584064 , 23.99623567]])
# WE ONLY WANT TO CREATE CENTER LABELS BASED ON THE SONG CLOSEST TO EACH CENTER W/ A NON NULL MAIN CAT
valid_rows = ~pd.isna(Y['main_cat'])
filtered_df = Y[valid_rows]
# FILTERING THE LISTS OF SONGS TO ONLY BE THOSE WITH AN OUTCOME LABEL
X_dist_filtered = X_dist[valid_rows]
# SETTING THE CENTER LABEL TO BE THE SONG WITH MIN DISTANCE FROM THE CENTER
representative_label_idx = np.argmin(X_dist_filtered, axis=0)
representative_label_idx = Y[valid_rows].index[representative_label_idx]
representative_label_idx
Int64Index([16244, 3899, 14435, 6176, 19799, 3533, 5851, 21316, 15039, 12440, 6570, 7132, 4112, 14786, 10343], dtype='int64')
#QUANTITATIVE FEATURES RELATED TO THE CENTER SONGS
X_representative_labels = X.loc[representative_label_idx]
# LABELS OF THE SONGS AT THE CENTER
y_representative_labels = np.array(Y.loc[representative_label_idx])
# PROPAGATING GENRE ONTO EVERY SONG IN X BASED ON THE CLUSTER IT BELONGED TO
y_propagated = np.empty(len(X), dtype=object)
for i in range(15):
    y_propagated[kmeans.labels_==i] = y_representative_labels[i]
# CREATING A NEW COLUMN THAT DISPLAYS THE PROPAGATED MAIN CATEGORY
combo['main_cat_prop'] = y_propagated
# REPLACING MAIN CATEGORY WITH THE PROPAGATED LABEL IF THE VALUE IS NULL
combo['main_cat'] = combo['main_cat'].fillna(combo['main_cat_prop'])
combo.head()
primary_artist | name | Playlist | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | main_cat | subgenre1 | subgenre2 | subgenre3 | main_cat_prop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 Savage | Spiral | No Cap | 0.814 | 0.659 | 2 | -4.475 | 1 | 0.0829 | 0.00155 | 0.025100 | 0.568 | 0.130 | 88.506 | 171527 | hip hop | atl hip hop | rap | trap | rock |
1 | VIC MENSA | SHELTER ft Wyclef Jean, ft Chance The Rapper | Mellow Bars | 0.664 | 0.660 | 0 | -5.284 | 1 | 0.2910 | 0.18100 | 0.000002 | 0.190 | 0.470 | 90.106 | 261467 | hip hop | chicago rap | conscious hip hop | hip hop | rock |
2 | Pooh Shiesty | Welcome To The Riches (feat. Lil Baby) | No Cap | 0.842 | 0.400 | 11 | -11.308 | 0 | 0.4860 | 0.04570 | 0.000000 | 0.172 | 0.205 | 130.018 | 192052 | hip hop | memphis hip hop | rap | southern hip hop | rock |
3 | Athletic Progression | Stepney Tale | Jazz Rap | 0.632 | 0.800 | 5 | -7.227 | 0 | 0.2340 | 0.54700 | 0.000000 | 0.147 | 0.496 | 92.757 | 209833 | hip hop | aarhus indie | danish modern jazz | jazz rap | rock |
4 | Ghetts | Fire and Brimstone | Grime Shutdown | 0.846 | 0.511 | 1 | -8.116 | 1 | 0.2630 | 0.03750 | 0.000005 | 0.147 | 0.346 | 136.964 | 190773 | hip hop | grime | uk alternative hip hop | uk hip hop | pop |
# COMPRESSING THE DATA INTO TWO DIMENSIONS FOR SCATTER PLOT
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# THE LABELS ARE THE PROPAGATED MAIN CATEGORY
labels = combo.main_cat
# CREATING A DATA FRAME WHERE PCA1 IS THE FIRST COMPONENT AND PCA2 IS THE SECOND
# HUE WILL BE ACCORDING TO LABEL
visualization = pd.DataFrame({'PCA1': X_pca[:, 0], 'PCA2': X_pca[:, 1], 'PROP_GENRE': labels})
visualization.head()
PCA1 | PCA2 | PROP_GENRE | |
---|---|---|---|
0 | -28.849928 | -8.808937 | hip hop |
1 | -27.299423 | -7.879013 | hip hop |
2 | 12.216420 | 0.219969 | hip hop |
3 | -24.759572 | -5.858289 | hip hop |
4 | 19.325073 | -2.427352 | hip hop |
# PLOTTING COMPONENTS BY PROPAGATED GENRE TYPE, SIZE AND TRANSPARENCY ADJUSTED FOR VISIBILITY
sns.scatterplot(x='PCA1', y='PCA2', hue='PROP_GENRE', palette='viridis', data=visualization, s=25, alpha=0.50)
# ADDING X MARKERS AT THE CENTERS OF EACH CLUSTER
# APPLYING THE SAME PCA TRANSFORMATION TO THESE CENTER POINTS
centers_pca = pca.transform(kmeans.cluster_centers_)
sns.scatterplot(x=centers_pca[:, 0], y=centers_pca[:, 1], marker='X', s=100, color='black')
# ADDING TITLES AND LABELS
plt.title('KMEANS CLUSTERING RESULTS - PROPAGATED MAIN GENRE')
plt.xlabel('PCA ONE')
plt.ylabel('PCA TWO')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol = 3)
plt.show()
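The choice of 15 clusters above was made so that all five main categories had a chance to be represented. If runtime allows, an inertia sweep (the elbow method) over a random subsample is one way to sanity-check that choice; the sketch below is an optional addition of mine and was not part of the original run.
# optional elbow check on a 20k-song subsample to keep the sweep fast
sample = X.sample(n=20000, random_state=42)
inertias = []
candidate_ks = range(5, 31, 5)
for k_candidate in candidate_ks:
    km = KMeans(n_clusters=k_candidate, n_init=10, random_state=42)
    km.fit(sample)
    inertias.append(km.inertia_)
plt.plot(list(candidate_ks), inertias, marker="o")
plt.xlabel("number of clusters k")
plt.ylabel("inertia")
plt.title("Elbow check for choosing k")
plt.show()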
# OUR X VARIABLES WILL BE THE 11 QUANTITATIVE VARIABLES SHARED BETWEEN THE TWO DATASETS
X = combo[['danceability','energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness','instrumentalness', 'liveness', 'valence', 'tempo']]
# OUR SECOND CLUSTERING MODEL WILL BE BASED ON THE SUBGENRE 1 ASSIGNED TO SONGS
Y = combo[['subgenre1']]
k = 40
# INITIALIZING A KMEANS CLUSTERING MODEL THAT CLUSTERS ALL THE SONGS BASED ON QUANTITATIVE SIMILARITY
kmeans = KMeans(n_clusters=k, n_init=10)
X_dist = kmeans.fit_transform(X)
# DISTANCE OF EACH TRACK FROM THE 40 CENTERS
X_dist
array([[25.1083022 , 85.95949778, 11.05361464, ..., 73.21512041, 33.996832 , 12.99629074], [23.50144056, 84.32654616, 13.1964748 , ..., 71.70612625, 34.44770049, 13.05947392], [19.24057569, 44.30261329, 49.75088499, ..., 32.23701508, 63.6748783 , 43.09684309], ..., [11.31588511, 53.21679765, 41.65724003, ..., 39.78492223, 58.36012168, 36.24151623], [12.37381669, 49.98113267, 44.66237502, ..., 36.6074394 , 60.53586021, 38.67251906], [ 4.89636053, 56.15945321, 38.12222798, ..., 43.73468664, 53.2510428 , 31.42289452]])
# WE ONLY WANT TO CREATE CENTER LABELS BASED ON THE SONG CLOSEST TO EACH CENTER W/ A NON NULL SUBGENRE1
valid_rows = ~pd.isna(Y['subgenre1'])
filtered_df = Y[valid_rows]
# FILTERING THE LISTS OF SONGS TO ONLY BE THOSE WITH AN OUTCOME LABEL
X_dist_filtered = X_dist[valid_rows]
# SETTING THE CENTER LABEL TO BE THE SONG WITH MIN DISTANCE FROM THE CENTER
representative_label_idx = np.argmin(X_dist_filtered, axis=0)
# INDEX OF SONGS AT THE CENTER WITH A LABEL
representative_label_idx = Y[valid_rows].index[representative_label_idx]
representative_label_idx
Int64Index([14778, 19573, 18963, 117, 8758, 14512, 6403, 18729, 15039, 11078, 3781, 10435, 6873, 17176, 14817, 6021, 20395, 15504, 3077, 2110, 899, 2897, 3458, 14856, 5259, 3899, 14638, 6406, 18891, 6141, 21245, 20720, 13566, 16646, 2009, 11199, 281, 8183, 6802, 3539], dtype='int64')
#QUANTITATIVE FEATURES RELATED TO THE CENTER SONGS
X_representative_labels = X.loc[representative_label_idx]
# LABELS OF THE SONGS AT THE CENTER
y_representative_labels = np.array(Y.loc[representative_label_idx])
# PROPAGATING SUBGENRE 1 ONTO EVERY SONG IN X BASED ON THE CLUSTER IT BELONGED TO
y_propagated = np.empty(len(X), dtype= object)
for i in range(k):
    y_propagated[kmeans.labels_==i] = y_representative_labels[i]
# CREATING A NEW COLUMN FOR SUBGENRE PROPAGATION
combo['subgenre1_prop'] = y_propagated
# FILLING SONGS THAT ARE MISSING A SUBGENRE 1 WITH THE PROPAGATED LABEL
combo['subgenre1'] = combo['subgenre1'].fillna(combo['subgenre1_prop'])
combo.head()
primary_artist | name | Playlist | danceability | energy | key | loudness | mode | speechiness | acousticness | ... | liveness | valence | tempo | duration_ms | main_cat | subgenre1 | subgenre2 | subgenre3 | main_cat_prop | subgenre1_prop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 Savage | Spiral | No Cap | 0.814 | 0.659 | 2 | -4.475 | 1 | 0.0829 | 0.00155 | ... | 0.568 | 0.130 | 88.506 | 171527 | hip hop | atl hip hop | rap | trap | rock | britpop |
1 | VIC MENSA | SHELTER ft Wyclef Jean, ft Chance The Rapper | Mellow Bars | 0.664 | 0.660 | 0 | -5.284 | 1 | 0.2910 | 0.18100 | ... | 0.190 | 0.470 | 90.106 | 261467 | hip hop | chicago rap | conscious hip hop | hip hop | rock | britpop |
2 | Pooh Shiesty | Welcome To The Riches (feat. Lil Baby) | No Cap | 0.842 | 0.400 | 11 | -11.308 | 0 | 0.4860 | 0.04570 | ... | 0.172 | 0.205 | 130.018 | 192052 | hip hop | memphis hip hop | rap | southern hip hop | rock | escape room |
3 | Athletic Progression | Stepney Tale | Jazz Rap | 0.632 | 0.800 | 5 | -7.227 | 0 | 0.2340 | 0.54700 | ... | 0.147 | 0.496 | 92.757 | 209833 | hip hop | aarhus indie | danish modern jazz | jazz rap | rock | electropop |
4 | Ghetts | Fire and Brimstone | Grime Shutdown | 0.846 | 0.511 | 1 | -8.116 | 1 | 0.2630 | 0.03750 | ... | 0.147 | 0.346 | 136.964 | 190773 | hip hop | grime | uk alternative hip hop | uk hip hop | pop | melodic hardcore |
5 rows × 21 columns
# COMPRESSING THE DATA INTO TWO DIMENSIONS FOR SCATTER PLOT
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# THE LABELS ARE THE PROPAGATED SUBGENRE
labels = combo.subgenre1_prop
# CREATING A DATA FRAME WHERE PCA1 IS THE FIRST COMPONENT AND PCA2 IS THE SECOND
# HUE WILL BE ACCORDING TO LABEL
visualization = pd.DataFrame({'PCA1': X_pca[:, 0], 'PCA2': X_pca[:, 1], 'PROP_SUBGENRE': labels})
visualization.head()
PCA1 | PCA2 | PROP_SUBGENRE | |
---|---|---|---|
0 | -28.849928 | -8.808937 | britpop |
1 | -27.299423 | -7.879013 | britpop |
2 | 12.216420 | 0.219969 | escape room |
3 | -24.759572 | -5.858289 | electropop |
4 | 19.325073 | -2.427352 | melodic hardcore |
# PLOTTING COMPONENTS BY PROPAGATED SUBGENRE -- TYPE, SIZE AND TRANSPARENCY ADJUSTED FOR VISIBILITY
sns.scatterplot(x='PCA1', y='PCA2', hue='PROP_SUBGENRE', palette='viridis', data=visualization, s=25, alpha=0.50)
# ADDING X MARKERS AT THE CENTERS OF EACH CLUSTER
# APPLYING THE SAME PCA TRANSFORMATION TO THESE CENTER POINTS
centers_pca = pca.transform(kmeans.cluster_centers_)
sns.scatterplot(x=centers_pca[:, 0], y=centers_pca[:, 1], marker='X', s=100, color='black')
# ADDING TITLES AND LABELS
plt.title('KMEANS CLUSTERING RESULTS - PROPAGATED SUBGENRE ONE')
plt.xlabel('PCA ONE')
plt.ylabel('PCA TWO')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol = 3)
plt.show()
# OUR X VARIABLES WILL BE THE 11 QUANTITATIVE VARIABLES SHARED BETWEEN THE TWO DATASETS
X = combo[['danceability','energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness','instrumentalness', 'liveness', 'valence', 'tempo']]
# OUR THIRD CLUSTERING MODEL WILL BE BASED ON THE SECOND SUBGENRE ASSIGNED TO SONGS
Y = combo[['subgenre2']]
k = 40
# INITIALIZING A KMEANS CLUSTERING MODEL THAT CLUSTERS ALL THE SONGS BASED ON QUANTITATIVE SIMILARITY
kmeans = KMeans(n_clusters= k, n_init=10)
X_dist = kmeans.fit_transform(X)
X_dist
array([[ 51.00447442, 114.4736377 , 23.78418377, ..., 33.12057127, 88.70925856, 5.220893 ], [ 49.38043161, 112.91124966, 22.77184009, ..., 31.50895688, 87.18758908, 3.91774277], [ 18.28638286, 73.026865 , 35.36710579, ..., 12.94833504, 47.65010338, 38.72239557], ..., [ 25.3957725 , 81.00149986, 31.17561528, ..., 7.39156092, 55.26540111, 30.33794077], [ 23.12421815, 77.89020025, 32.79023015, ..., 5.57426529, 52.14446556, 32.6969158 ], [ 24.73019429, 84.87283327, 25.90828261, ..., 3.80498828, 59.26497474, 25.47548837]])
# WE ONLY WANT TO CREATE CENTER LABELS BASED ON THE SONG CLOSEST TO EACH CENTER W/ A NON NULL SUBGENRE 2
valid_rows = ~pd.isna(Y['subgenre2'])
filtered_df = Y[valid_rows]
# FILTERING THE LISTS OF SONGS TO ONLY BE THOSE WITH AN OUTCOME LABEL
X_dist_filtered = X_dist[valid_rows]
# SETTING THE CENTER LABEL TO BE THE SONG WITH MIN DISTANCE FROM THE CENTER
representative_label_idx = np.argmin(X_dist_filtered, axis=0)
# INDEXES OF SONGS AT THE CENTER WITH A LABEL
representative_label_idx = Y[valid_rows].index[representative_label_idx]
representative_label_idx
Int64Index([ 6873, 2897, 5385, 2547, 9295, 3318, 21454, 14512, 6731, 9441, 15039, 281, 4779, 3533, 2625, 11469, 8829, 19195, 2609, 9802, 8229, 20720, 291, 18891, 2009, 8616, 20395, 3521, 3604, 21905, 19573, 674, 6802, 17156, 17490, 16044, 6750, 15641, 16511, 7], dtype='int64')
#QUANTITATIVE FEATURES RELATED TO THE CENTER SONGS
X_representative_labels = X.loc[representative_label_idx]
# LABELS OF THE SONGS AT THE CENTER
y_representative_labels = np.array(Y.loc[representative_label_idx])
# PROPAGATING SUBGENRE 2 ONTO EVERY SONG IN X BASED ON THE CLUSTER IT BELONGED TO
y_propagated = np.empty(len(X), dtype= object)
for i in range(k):
    y_propagated[kmeans.labels_==i] = y_representative_labels[i]
# CREATING A SUBGENRE PROPAGATION LABEL IN COMBO AND FILLING SONGS THAT ARE MISSING A SUBGENRE 2
combo['subgenre2_prop'] = y_propagated
combo['subgenre2'] = combo['subgenre2'].fillna(combo['subgenre2_prop'])
combo.head()
primary_artist | name | Playlist | danceability | energy | key | loudness | mode | speechiness | acousticness | ... | valence | tempo | duration_ms | main_cat | subgenre1 | subgenre2 | subgenre3 | main_cat_prop | subgenre1_prop | subgenre2_prop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 Savage | Spiral | No Cap | 0.814 | 0.659 | 2 | -4.475 | 1 | 0.0829 | 0.00155 | ... | 0.130 | 88.506 | 171527 | hip hop | atl hip hop | rap | trap | rock | britpop | detroit trap |
1 | VIC MENSA | SHELTER ft Wyclef Jean, ft Chance The Rapper | Mellow Bars | 0.664 | 0.660 | 0 | -5.284 | 1 | 0.2910 | 0.18100 | ... | 0.470 | 90.106 | 261467 | hip hop | chicago rap | conscious hip hop | hip hop | rock | britpop | detroit trap |
2 | Pooh Shiesty | Welcome To The Riches (feat. Lil Baby) | No Cap | 0.842 | 0.400 | 11 | -11.308 | 0 | 0.4860 | 0.04570 | ... | 0.205 | 130.018 | 192052 | hip hop | memphis hip hop | rap | southern hip hop | rock | escape room | uk house |
3 | Athletic Progression | Stepney Tale | Jazz Rap | 0.632 | 0.800 | 5 | -7.227 | 0 | 0.2340 | 0.54700 | ... | 0.496 | 92.757 | 209833 | hip hop | aarhus indie | danish modern jazz | jazz rap | rock | electropop | detroit trap |
4 | Ghetts | Fire and Brimstone | Grime Shutdown | 0.846 | 0.511 | 1 | -8.116 | 1 | 0.2630 | 0.03750 | ... | 0.346 | 136.964 | 190773 | hip hop | grime | uk alternative hip hop | uk hip hop | pop | melodic hardcore | progressive doom |
5 rows × 22 columns
# COMPRESSING THE DATA INTO TWO DIMENSIONS FOR SCATTER PLOT
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# THE LABELS ARE THE PROPAGATED SUBGENRE
labels = combo.subgenre2_prop
# CREATING A DATA FRAME WHERE PCA1 IS THE FIRST COMPONENT AND PCA2 IS THE SECOND
# HUE WILL BE ACCORDING TO LABEL
visualization = pd.DataFrame({'PCA1': X_pca[:, 0], 'PCA2': X_pca[:, 1], 'PROP_SUBGENRE': labels})
visualization.head()
PCA1 | PCA2 | PROP_SUBGENRE | |
---|---|---|---|
0 | -28.849928 | -8.808937 | detroit trap |
1 | -27.299423 | -7.879013 | detroit trap |
2 | 12.216420 | 0.219969 | uk house |
3 | -24.759572 | -5.858289 | detroit trap |
4 | 19.325073 | -2.427352 | progressive doom |
# PLOTTING COMPONENTS BY PROPAGATED SUBGENRE -- TYPE, SIZE AND TRANSPARENCY ADJUSTED FOR VISIBILITY
sns.scatterplot(x='PCA1', y='PCA2', hue='PROP_SUBGENRE', palette='viridis', data=visualization, s=25, alpha=0.50)
# ADDING X MARKERS AT THE CENTERS OF EACH CLUSTER
# APPLYING THE SAME PCA TRANSFORMATION TO THESE CENTER POINTS
centers_pca = pca.transform(kmeans.cluster_centers_)
sns.scatterplot(x=centers_pca[:, 0], y=centers_pca[:, 1], marker='X', s=100, color='black')
# ADDING TITLES AND LABELS
plt.title('KMEANS CLUSTERING RESULTS - PROPAGATED SUBGENRE TWO')
plt.xlabel('PCA ONE')
plt.ylabel('PCA TWO')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol = 3)
plt.show()
# OUR X VARIABLES WILL BE THE 11 QUANTITATIVE VARIABLES SHARED BETWEEN THE TWO DATASETS
X = combo[['danceability','energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness','instrumentalness', 'liveness', 'valence', 'tempo']]
# OUR FOURTH CLUSTERING MODEL WILL BE BASED ON THE THIRD SUBGENRE ASSIGNED TO SONGS
Y = combo[['subgenre3']]
k = 40
# INITIALIZING A KMEANS CLUSTERING MODEL THAT CLUSTERS ALL THE SONGS BASED ON QUANTITATIVE SIMILARITY
kmeans = KMeans(n_clusters= k, n_init=10)
X_dist = kmeans.fit_transform(X)
X_dist
array([[45.13715268, 7.92447058, 30.53948862, ..., 22.08749767, 50.43472261, 20.92426365], [43.57367502, 8.88037147, 28.93093746, ..., 23.74953841, 48.81881489, 19.61524431], [ 7.94765322, 38.20600656, 14.76507954, ..., 62.56799643, 13.21453344, 26.74972408], ..., [14.83409063, 29.94279672, 7.97589674, ..., 54.65558718, 18.43001467, 21.14214005], [11.7806233 , 32.9787755 , 7.47522411, ..., 57.44587851, 14.5326016 , 22.96490718], [15.93506956, 26.7270106 , 1.81552604, ..., 50.43056519, 20.89266012, 15.95220671]])
# WE ONLY WANT TO CREATE CENTER LABELS BASED ON THE SONG CLOSEST TO EACH CENTER W/ A NON NULL SUBGENRE 3
valid_rows = ~pd.isna(Y['subgenre3'])
filtered_df = Y[valid_rows]
# FILTERING THE LISTS OF SONGS TO ONLY BE THOSE WITH AN OUTCOME LABEL
X_dist_filtered = X_dist[valid_rows]
# SETTING THE CENTER LABEL TO BE THE SONG WITH MIN DISTANCE FROM THE CENTER
representative_label_idx = np.argmin(X_dist_filtered, axis=0)
representative_label_idx = Y[valid_rows].index[representative_label_idx]
representative_label_idx
Int64Index([ 2844, 13566, 11264, 2897, 2609, 17490, 6802, 3706, 2343, 3318, 14101, 19641, 8758, 3533, 6873, 18991, 17343, 7417, 11199, 291, 15039, 6021, 16646, 7260, 10970, 3521, 11469, 8662, 3269, 3458, 4326, 12023, 18259, 15340, 19573, 2362, 2559, 281, 8693, 16954], dtype='int64')
#QUANTITATIVE FEATURES RELATED TO THE CENTER SONGS
X_representative_labels = X.loc[representative_label_idx]
# LABELS OF THE SONGS AT THE CENTER
y_representative_labels = np.array(Y.loc[representative_label_idx])
# PROPAGATING SUBGENRE 3 ONTO EVERY SONG IN X BASED ON THE CLUSTER IT BELONGED TO
y_propagated = np.empty(len(X), dtype=object)
for i in range(k):
    y_propagated[kmeans.labels_==i] = y_representative_labels[i]
# ADDING SUBGENRE 3 PROPAGATIONS TO THE DATASET
combo['subgenre3_prop'] = y_propagated
combo['subgenre3'] = combo['subgenre3'].fillna(combo['subgenre3_prop'])
combo.head()
primary_artist | name | Playlist | danceability | energy | key | loudness | mode | speechiness | acousticness | ... | tempo | duration_ms | main_cat | subgenre1 | subgenre2 | subgenre3 | main_cat_prop | subgenre1_prop | subgenre2_prop | subgenre3_prop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21 Savage | Spiral | No Cap | 0.814 | 0.659 | 2 | -4.475 | 1 | 0.0829 | 0.00155 | ... | 88.506 | 171527 | hip hop | atl hip hop | rap | trap | rock | britpop | detroit trap | mod revival |
1 | VIC MENSA | SHELTER ft Wyclef Jean, ft Chance The Rapper | Mellow Bars | 0.664 | 0.660 | 0 | -5.284 | 1 | 0.2910 | 0.18100 | ... | 90.106 | 261467 | hip hop | chicago rap | conscious hip hop | hip hop | rock | britpop | detroit trap | mod revival |
2 | Pooh Shiesty | Welcome To The Riches (feat. Lil Baby) | No Cap | 0.842 | 0.400 | 11 | -11.308 | 0 | 0.4860 | 0.04570 | ... | 130.018 | 192052 | hip hop | memphis hip hop | rap | southern hip hop | rock | escape room | uk house | hard rock |
3 | Athletic Progression | Stepney Tale | Jazz Rap | 0.632 | 0.800 | 5 | -7.227 | 0 | 0.2340 | 0.54700 | ... | 92.757 | 209833 | hip hop | aarhus indie | danish modern jazz | jazz rap | rock | electropop | detroit trap | post-teen pop |
4 | Ghetts | Fire and Brimstone | Grime Shutdown | 0.846 | 0.511 | 1 | -8.116 | 1 | 0.2630 | 0.03750 | ... | 136.964 | 190773 | hip hop | grime | uk alternative hip hop | uk hip hop | pop | melodic hardcore | progressive doom | hurdy-gurdy |
5 rows × 23 columns
# COMPRESSING THE DATA INTO TWO DIMENSIONS FOR SCATTER PLOT
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# THE LABELS ARE THE PROPAGATED SUBGENRE
labels = combo.subgenre3_prop
# CREATING A DATA FRAME WHERE PCA1 IS THE FIRST COMPONENT AND PCA2 IS THE SECOND
# HUE WILL BE ACCORDING TO LABEL
visualization = pd.DataFrame({'PCA1': X_pca[:, 0], 'PCA2': X_pca[:, 1], 'PROP_SUBGENRE': labels})
visualization.head()
PCA1 | PCA2 | PROP_SUBGENRE | |
---|---|---|---|
0 | -28.849928 | -8.808937 | mod revival |
1 | -27.299423 | -7.879013 | mod revival |
2 | 12.216420 | 0.219969 | hard rock |
3 | -24.759572 | -5.858289 | post-teen pop |
4 | 19.325073 | -2.427352 | hurdy-gurdy |
# PLOTTING COMPONENTS BY PROPAGATED SUBGENRE -- TYPE, SIZE AND TRANSPARENCY ADJUSTED FOR VISIBILITY
sns.scatterplot(x='PCA1', y='PCA2', hue='PROP_SUBGENRE', palette='viridis', data=visualization, s=25, alpha=0.50)
# ADDING X MARKERS AT THE CENTERS OF EACH CLUSTER
# APPLYING THE SAME PCA TRANSFORMATION TO THESE CENTER POINTS
centers_pca = pca.transform(kmeans.cluster_centers_)
sns.scatterplot(x=centers_pca[:, 0], y=centers_pca[:, 1], marker='X', s=100, color='black')
# ADDING TITLES AND LABELS
plt.title('KMEANS CLUSTERING RESULTS - PROPAGATED SUBGENRE THREE')
plt.xlabel('PCA ONE')
plt.ylabel('PCA TWO')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol = 3)
plt.show()
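The same propagation block appears four times above (main category plus three subgenres), with only the label column and the number of clusters changing. If I were refactoring, a small helper like the sketch below would remove the duplication; it assumes the combo dataframe and feature list used throughout this notebook and mirrors the steps above rather than adding anything new.
AUDIO_FEATURES = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
def propagate_labels(df, label_col, k):
    # cluster on the shared audio features; fit_transform returns distances to each center
    features = df[AUDIO_FEATURES]
    km = KMeans(n_clusters=k, n_init=10)
    dist = km.fit_transform(features)
    # only labeled songs are eligible to name a cluster center
    valid = df[label_col].notna()
    rep_pos = np.argmin(dist[valid.values], axis=0)
    rep_idx = df.index[valid.values][rep_pos]
    rep_labels = df.loc[rep_idx, label_col].to_numpy()
    # every song inherits the label of its cluster's representative
    df[label_col + '_prop'] = rep_labels[km.labels_]
    df[label_col] = df[label_col].fillna(df[label_col + '_prop'])
    return df
# equivalent to the four blocks above, e.g.:
# combo = propagate_labels(combo, 'main_cat', k=15)
# combo = propagate_labels(combo, 'subgenre1', k=40)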
# NOW, I WANT TO CREATE A PROBABILISTIC MODEL THAT PREDICTS THE LIKELIHOOD A SONG BELONGS ON A PLAYLIST BASED ON ITS MAIN GENRE AND 3 PROPAGATED SUBGENRES
X_names = combo[['primary_artist', 'name']]
X = combo[['main_cat', 'subgenre1_prop', 'subgenre2_prop', 'subgenre3_prop']]
Y = combo[['Playlist']]
# CREATING DUMMIES FROM THE MAIN CATEGORY AND THE THREE PROPAGATED SUBGENRES
# FOR EVERY SUBGENRE REPRESENTED IN THE FRAME THERE WILL NOW BE A COLUMN THAT SAYS SUBGENRE1_PROP_...
X = pd.get_dummies(X)
# INSTEAD OF HAVING THREE SEPARATE DUMMY COLUMNS FOR THE SAME SUBGENRE, I WANT TO COMBINE THEM INTO ONE
## SELECTING COLUMNS THAT FOLLOW THE SUBGENRE#_PROP_ STRUCTURE USING A REGULAR EXPRESSION
selected_columns = X.filter(regex=r'^subgenre[1-3]_prop_').columns
for column in selected_columns:
    # THE SUBGENRE NAME IS EVERYTHING AFTER "_prop_"
    prop_part = column.split("_prop_", 1)[1]
    # THE COMBINED COLUMN IS 1 IF THE SUBGENRE APPEARED IN ANY OF THE THREE PROPAGATED SLOTS
    if prop_part in X.columns:
        X[prop_part] = X[prop_part] | X[column]
    else:
        X[prop_part] = X[column]
# DROPPING THE ORIGINAL SUBGENRE#_PROP_ COLUMNS
X = X.drop(columns=selected_columns)
X.head()
main_cat_hip hop | main_cat_indie | main_cat_metal | main_cat_pop | main_cat_rock | acoustic blues | adult standards | album rock | alternative dance | alternative hip hop | ... | industrial metal | israeli singer-songwriter | mod revival | modern folk rock | neo mellow | post-grunge | post-teen pop | socal pop punk | uk pop | underground hip hop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 88 columns
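As an aside, the dummy-then-combine loop above can also be expressed with scikit-learn's MultiLabelBinarizer, which builds the one-column-per-subgenre indicator matrix directly from each song's set of propagated subgenres. The sketch below is an alternative I did not use in the notebook, and the row-wise apply is slow on 1.1M songs, but it makes the intent explicit.
from sklearn.preprocessing import MultiLabelBinarizer
# each song's set of propagated subgenres (duplicates collapse automatically in a set)
subgenre_sets = combo[['subgenre1_prop', 'subgenre2_prop', 'subgenre3_prop']].apply(lambda row: set(row.dropna()), axis=1)
mlb = MultiLabelBinarizer()
subgenre_indicators = pd.DataFrame(mlb.fit_transform(subgenre_sets), columns=mlb.classes_, index=combo.index)
# joined with pd.get_dummies(combo['main_cat']), this would reproduce the design matrix built above
subgenre_indicators.head()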
# OUR TRAINING SET FOR THE LAST MODEL WILL BE ALL SONGS THAT HAVE A PLAYLIST LABEL
valid_rows = ~pd.isna(Y['Playlist'])
Y_train = Y[valid_rows]
X_train = X[valid_rows]
# FOR EASE OF COMPUTING AND INTERPRETABILITY, WE ARE ONLY GOING TO ADD SONGS TO THE TOP 50 PLAYLISTS REPRESENTED IN THE DATASET
value_counts = Y_train['Playlist'].value_counts()
# COUNT >82 GIVES US EXACTLY 50 PLAYLISTS
subset = value_counts[value_counts > 82].index
Y_train = Y_train[Y_train['Playlist'].isin(subset)]
X_train = X_train.loc[Y_train.index]
# creating a dictionary that we can add to later
playlist_dictionary = {}
# for all of the playlists present in the limited training frame
for playlist in Y_train['Playlist'].unique():
    # the key is the playlist and the values are the songs belonging to that playlist
    current_playlist = {'Playlist': playlist, 'Songs': Y_train.index[Y_train['Playlist'] == playlist]}
    playlist_dictionary[playlist] = current_playlist
# displaying one of the playlist-to-song associations
playlist_dictionary['80s Soft Rock']
{'Playlist': '80s Soft Rock', 'Songs': Int64Index([13944, 14074, 14160, 14213, 14232, 14324, 14385, 14386, 14476, 14492, 14682, 14848, 14849, 14889, 15305, 15660, 15850, 16002, 16009, 16084, 16217, 16228, 16378, 16391, 16403, 16498, 16519, 16655, 16668, 16690, 16759, 16981, 17046, 17141, 17216, 17261, 17288, 17428, 17452, 17626, 17648, 17691, 17885, 17923, 17972, 18130, 18191, 18193, 18523, 18561, 18580, 18633, 18805, 18878, 18885, 18887, 18933, 18941, 18998, 19082, 19135, 19369, 19538, 19791, 20214, 20307, 20339, 20413, 20581, 20628, 20709, 20856, 20931, 20994, 21120, 21126, 21263, 21479, 21761, 21846, 21907, 21908, 21975, 22260, 22300, 22307, 22408], dtype='int64')}
# training a logistic regression model to predict playlist names
model = LogisticRegression()
# train on all songs with a named playlist in the top 50 playlists represented
# passing the Playlist column as a 1-D series avoids sklearn's column-vector warning
model.fit(X_train, Y_train['Playlist'])
LogisticRegression()
# limiting the test / prediction set to be all songs that do not have a playlist
valid_rows = pd.isna(Y['Playlist'])
Y_test = Y[valid_rows]
X_test= X[valid_rows]
# my computer crashes at all 1.1 million samples, so I am only predicting playlist names for 200k songs
X_test_sample = X_test.sample(n=200000)
Y_pred = model.predict(X_test_sample)
# one benefit of the logistic regression model is that you can see the likelihood that each prediction belongs to each of the classification options
# grab the probabilities that each song belongs to one of the 50 playlists
probabilities = model.predict_proba(X_test_sample)
# create the dataframe that displays each song's probability of being in a particular playlist
probabilities_df = pd.DataFrame(probabilities, columns=model.classes_)
probabilities_df
10s Pop Rock | 80s Soft Rock | Acoustic Hits | Altar | Beach Vibes | Black & Dark Metal | Black Metal Essentials | Deathcore | Deep Dive - 00s Metal | Deep Dive - 90s Metal | ... | Rocktronic | Shoegaze Classics | Soft Pop Hits | Stoner Rock | The Lot | Thrashers | Woodstock | pulp | tear drop | vaporwave | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.051207 | 0.291030 | 0.001040 | 0.000359 | 0.058965 | 0.000620 | 0.000773 | 0.000690 | 0.000691 | 0.000661 | ... | 0.047921 | 0.001539 | 0.000971 | 0.000884 | 0.083552 | 0.000550 | 0.256982 | 0.000637 | 0.001113 | 0.000553 |
1 | 0.000305 | 0.000840 | 0.000466 | 0.031933 | 0.000192 | 0.000283 | 0.001892 | 0.001314 | 0.001687 | 0.000290 | ... | 0.000298 | 0.013194 | 0.001761 | 0.000625 | 0.001727 | 0.000164 | 0.001275 | 0.023309 | 0.001405 | 0.011339 |
2 | 0.000480 | 0.001626 | 0.260261 | 0.000192 | 0.003343 | 0.001564 | 0.001258 | 0.000309 | 0.000913 | 0.000806 | ... | 0.000410 | 0.000921 | 0.170773 | 0.001518 | 0.000263 | 0.000175 | 0.000402 | 0.000227 | 0.000589 | 0.000663 |
3 | 0.000757 | 0.002504 | 0.344866 | 0.000177 | 0.001791 | 0.000728 | 0.001138 | 0.000583 | 0.000350 | 0.000476 | ... | 0.001546 | 0.001555 | 0.217848 | 0.001397 | 0.000859 | 0.000711 | 0.000274 | 0.001853 | 0.001114 | 0.001636 |
4 | 0.192785 | 0.097623 | 0.001109 | 0.000157 | 0.138664 | 0.000247 | 0.000552 | 0.001090 | 0.000777 | 0.001634 | ... | 0.153260 | 0.000695 | 0.000895 | 0.000652 | 0.124255 | 0.002152 | 0.032819 | 0.000646 | 0.000246 | 0.002552 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
199995 | 0.000736 | 0.001617 | 0.000972 | 0.037113 | 0.000378 | 0.001127 | 0.000742 | 0.001233 | 0.001083 | 0.001602 | ... | 0.000740 | 0.045242 | 0.001524 | 0.000177 | 0.002212 | 0.001029 | 0.000877 | 0.055463 | 0.001409 | 0.057281 |
199996 | 0.000859 | 0.000391 | 0.001161 | 0.066156 | 0.001315 | 0.000262 | 0.001224 | 0.000920 | 0.001139 | 0.001005 | ... | 0.002023 | 0.071069 | 0.000923 | 0.001716 | 0.000432 | 0.000582 | 0.000657 | 0.113929 | 0.001548 | 0.084789 |
199997 | 0.001026 | 0.000172 | 0.000967 | 0.076222 | 0.002216 | 0.000709 | 0.001312 | 0.000563 | 0.001428 | 0.001807 | ... | 0.002900 | 0.097423 | 0.001830 | 0.001365 | 0.000422 | 0.000481 | 0.000772 | 0.066291 | 0.002480 | 0.077381 |
199998 | 0.000859 | 0.000391 | 0.001161 | 0.066156 | 0.001315 | 0.000262 | 0.001224 | 0.000920 | 0.001139 | 0.001005 | ... | 0.002023 | 0.071069 | 0.000923 | 0.001716 | 0.000432 | 0.000582 | 0.000657 | 0.113929 | 0.001548 | 0.084789 |
199999 | 0.000736 | 0.001617 | 0.000972 | 0.037113 | 0.000378 | 0.001127 | 0.000742 | 0.001233 | 0.001083 | 0.001602 | ... | 0.000740 | 0.045242 | 0.001524 | 0.000177 | 0.002212 | 0.001029 | 0.000877 | 0.055463 | 0.001409 | 0.057281 |
200000 rows × 50 columns
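The goals at the top of this notebook also mention assigning a song to a playlist only when its predicted probability clears a threshold. The code that follows takes the five most probable songs per playlist instead, but a thresholded version is a short operation on this probabilities dataframe; the 0.25 cutoff below is an arbitrary value for illustration.
# hedged sketch: keep only (song, playlist) pairs whose predicted probability exceeds a cutoff
THRESHOLD = 0.25  # arbitrary illustrative value
above_threshold = probabilities_df.stack()
above_threshold = above_threshold[above_threshold > THRESHOLD]
# a MultiIndex series of (row position in the 200k sample, playlist name) -> probability
print(above_threshold.head(10))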
# we will now create a dictionary that stores, for each playlist, the five songs with the highest probability of belonging to it
top_songs_dict = {}
# Loop through each column (playlist) in the DataFrame
for playlist in probabilities_df.columns:
    # Get the top five songs based on largest probabilities
    top_song_indices = probabilities_df[playlist].nlargest(5).index.to_list()
    # Store the top song indices in the dictionary
    top_songs_dict[playlist] = top_song_indices
# for the top 50 playlists, recommend one song out of the top five predicted
for playlist in playlist_dictionary:
    # the index can be 0,1,2,3,4 and will select one of the top five songs
    random_index = random.randint(0, 4)
    # map the row position in the prediction sample back to that song's row in X_names
    song = X_names.loc[X_test_sample.index[top_songs_dict[playlist][random_index]]]
    print(f"If you like {playlist} then you may like {song['name']} by {song['primary_artist']} \n ")
If you like Mellow Bars then you may like Birds Are Chirping by D-Block Europe If you like Jazz Rap then you may like The Jackson Song by Patti Smith If you like tear drop then you may like Never Separate - A Song For Friends by Vickie Winans If you like I Love My '90s Hip-Hop then you may like Before It All Ends by kent If you like Lo-fi Indie then you may like Angel Witch by Angel Witch If you like Ethereal then you may like Ride Again by The .357 STring Band If you like vaporwave then you may like Jonella And Jack by Johnny Otis If you like Future Funk then you may like Bionic Muscle by Truffel the Phunky Phaqir If you like Fresh Finds: Indie then you may like Love's In Need Of Love Today by Stevie Wonder If you like pulp then you may like Strange Fruit by Billie Holiday If you like Indie Instrumental then you may like All or None by Pearl Jam If you like Retrowave then you may like Enough by Surfing If you like Shoegaze Classics then you may like The King Who Wouldn't Smile by The Handsome Family If you like Fresh Finds: Experimental then you may like Antisocial by Patient Sixty-Seven If you like Altar then you may like Solid (feat. Drake) by Young Stoner Life If you like Noisy then you may like Enemies (feat. DaBaby) by Post Malone If you like Modern Psychedelia then you may like colorblind (kina remix) by Mokita If you like Northern Spirits then you may like 5 Below by Evie Ladin Band If you like Thrashers then you may like כל החברים שלך by Anna Zak If you like Metal Covers then you may like Body (Remix) [feat. ArrDee, E1 (3x3), ZT (3x3), Bugzy Malone, Buni, Fivio Foreign & Darkoo] by Tion Wayne If you like New Blood then you may like Dumpers by Flowdan If you like Hard Rock then you may like סוף השבוע by Eliad If you like Instrumental Madness then you may like Dumpers by Flowdan If you like Stoner Rock then you may like Creulty by Versiple If you like Deep Dive - 90s Metal then you may like Dumpers by Flowdan If you like Progressive Metal then you may like Hustle Hard Remix by Ace Hood If you like Black Metal Essentials then you may like They Don't Want What We Want (And They Don't Care) by Asking Alexandria If you like Black & Dark Metal then you may like Introduction by Kris Kristofferson - Live at Madison Square Garden, New York, NY - October 1992 by Kris Kristofferson If you like Got Djent? then you may like Zebra by Cal in Red If you like Heavy Metal then you may like Lose Yourself by Eminem If you like Old School Metal then you may like הטוב הרע ואחותך by Tuna If you like Heavy Queens then you may like Deeper Love by Mary Beth Maziarz If you like Metal Ballads then you may like 5 Below by Evie Ladin Band If you like Industrial Metal then you may like Lose Yourself by Eminem If you like Deathcore then you may like Cymbol by If Thousands If you like Deep Dive - 00s Metal then you may like סוף השבוע by Eliad If you like Acoustic Hits then you may like Jobs by City Girls If you like Hot Acoustics then you may like anything by Adrianne Lenker If you like Retro Pop then you may like I Wish by Skee-Lo If you like Soft Pop Hits then you may like Operation Big Beat by From Bubblegum To Sky If you like Global X then you may like And Then What by Jeezy If you like The Lot then you may like Still Counting by Volbeat If you like 10s Pop Rock then you may like Bad Boy (with Young Thug) by Juice WRLD If you like Woodstock then you may like Alone (feat. 
Lil Durk) by 42 Dugg If you like Deep Dive: 80s Rock then you may like זן נדיר by Korin Allal If you like Prog Rock Monsters then you may like Twister by Stephen Vitiello If you like 80s Soft Rock then you may like Maggie by Tanglefoot If you like Rocktronic then you may like Scared by Kembe X If you like Fresh Finds: Rock then you may like QUESO by AG Club If you like Beach Vibes then you may like True Indeed by Busta Rhymes
RESULTS:
- I was able to build a model that can recommend songs based on a playlist that a listener is already familiar with.
- Similarity to songs in a playlist was built by iterating a set of 1.2M unlabeled songs through four semi-supervised KMeans models. First, I assigned main genres using propagated labeling, then I followed the same logic to propagate a series of subgenres. With these four input columns, I built a probabilistic recommendation engine that took the five songs most likely to belong to each playlist and displayed one song a user may like.
LIMITATIONS:
- I was only able to apply the logistic regression model to 200K songs, rather than the entirety of the 1.2M songs, partly due to limited computing power.
- I also had to limit the number of subgenres that I propagated to each song; this was due to time constraints, as each KMeans model took roughly 10 minutes to run.
- Lastly, I would have preferred to assign songs to all 1300 playlists represented, but to keep the output digestible I limited it to the top 50 playlists represented in the dataframe.
- Since most of the data is unlabeled, we are unable to track accuracy on the propagated labels (a sketch of one possible hold-out check on the labeled songs follows this list)
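One partial workaround for the accuracy limitation is to hold out a slice of the labeled songs and check how often their true playlist appears among the model's top predictions. The sketch below is my own suggested check, reusing X_train and Y_train from above with an illustrative top-3 cutoff; it was not run as part of this tutorial.
from sklearn.model_selection import train_test_split
# hold out 20% of the labeled songs and measure a top-3 hit rate
X_tr, X_ho, y_tr, y_ho = train_test_split(X_train, Y_train['Playlist'], test_size=0.2, random_state=0, stratify=Y_train['Playlist'])
holdout_model = LogisticRegression(max_iter=1000)
holdout_model.fit(X_tr, y_tr)
proba = holdout_model.predict_proba(X_ho)
# positions of the 3 most probable playlists for each held-out song
top3 = np.argsort(proba, axis=1)[:, -3:]
hits = [truth in holdout_model.classes_[idx] for truth, idx in zip(y_ho, top3)]
print(f"top-3 hit rate on held-out labeled songs: {np.mean(hits):.2%}")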
FURTHER APPLICATIONS:
- Tying recommendations to consumer profiles
- Expanding the number of playlists that songs can be added to
- Increasing computing power to utilize all 1.2M songs in assigning recommendations to playlists
- Integrating with an interactive platform like Streamlit so I can take user input when giving recommendations (a mock-up sketch follows this list)
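As a small illustration of the Streamlit idea, the mock-up below assumes the notebook's top-five recommendations were saved as a plain dictionary mapping each playlist name to a list of "Song by Artist" strings in a hypothetical file called recommendations.pkl; it is a sketch of the interface, not a deployed app.
# streamlit_app.py -- a mock-up sketch, not a deployed app
# assumes a hypothetical recommendations.pkl holding {playlist name: ["Song by Artist", ...]}
import pickle
import streamlit as st
with open("recommendations.pkl", "rb") as f:
    recommendations = pickle.load(f)
st.title("Playlist Expander")
playlist = st.selectbox("Pick a playlist you already like", sorted(recommendations))
st.subheader("You may also like")
for suggestion in recommendations[playlist]:
    st.write(suggestion)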
# Converting my CoLab to html so that I can upload this file to Github
!jupyter nbconvert --to html /content/drive/MyDrive/spotify_finaltutorial.ipynb
[NbConvertApp] Converting notebook /content/drive/MyDrive/spotify_finaltutorial.ipynb to html [NbConvertApp] Writing 3089218 bytes to /content/drive/MyDrive/spotify_finaltutorial.html