Part 1: Introduction¶
a. It is recommended that you complete the introduction after you have finished all the other parts.¶
b. State whether you solved your project as a regression or classification problem.¶
I solved my project as a regression problem.
c. See the general proposal comments for instructions on how to convert a regression problem to a classification problem, if that is what you are interested in working on.¶
d. Describe the major findings you have identified based on your analysis.¶
Predictive Power Is Moderate¶
My best linear model achieved R-squared values around 0.10, meaning that only about 10% of the variance in popularity is explained by the model.
This suggests that while audio features carry some signal, popularity is influenced by many external factors not captured in the dataset.
Model Stability Across Folds¶
The confidence intervals are relatively tight, especially the 68% CI, indicating that my model's performance is consistent across different data splits.
However, performance still isn't very strong, reinforcing that these audio features only go so far.
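As a minimal sketch of how such fold-level intervals can be produced (this assumes the standardized dataframe spotify_model from Part 4 and a deliberately simple illustrative formula, not my exact CV setup; mean ± 1 standard deviation across folds approximates a 68% interval):
# Sketch: fold-to-fold variability of a linear model's R-squared.
# Assumes spotify_model (standardized inputs/response) from Part 4 exists.
import numpy as np
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=100)
r2_scores = []
for train_idx, test_idx in kf.split(spotify_model):
    train, test = spotify_model.iloc[train_idx], spotify_model.iloc[test_idx]
    fit = smf.ols('track_popularity ~ energy + loudness + duration_ms', data=train).fit()
    resid = test['track_popularity'] - fit.predict(test)
    ss_tot = ((test['track_popularity'] - test['track_popularity'].mean())**2).sum()
    r2_scores.append(1 - (resid**2).sum() / ss_tot)
r2_scores = np.array(r2_scores)
# mean +/- 1 SD across folds approximates a 68% interval
print(f"R-squared: {r2_scores.mean():.3f} +/- {r2_scores.std():.3f}")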
Standardization Was Crucial¶
By standardizing variables like loudness, tempo, duration_ms, etc., I improved model stability and allowed coefficients to be on comparable scales.
Without standardization, features on large scales would dominate the model coefficients.
The Best Model¶
Polynomial Features with Genre Interactions (best predicted vs. observed) was my best-performing model.
It had the best R-squared value, approximately 0.1100.
It had the lowest RMSE value, approximately 0.9434.
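For reference, both metrics can be read straight off a fitted statsmodels result; a minimal sketch, assuming fit_m7 (Model 7 from Part 4) is the model in question. This recovers the in-sample versions; the cross-validated values come from the fold loop in the modeling notebook.
# Sketch: in-sample R-squared and RMSE from a fitted OLS result.
import numpy as np
rmse = np.sqrt(np.mean(fit_m7.resid ** 2))  # root-mean-squared residual error
print(f"R-squared: {fit_m7.rsquared:.4f}")
print(f"RMSE:      {rmse:.4f}")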
e. Which inputs/features seem to influence the response/outcome the most?¶
Energy: more energetic tracks are associated with lower popularity after accounting for other features (energy carries the largest negative coefficient in the linear models).
Loudness: louder tracks are positively associated with popularity after accounting for other features.
Duration_ms: longer songs may slightly reduce popularity, reflecting listener preferences for shorter songs.
f. What supported your conclusion? Was it only through predictive models?¶
My conclusions were primarily supported through predictive modeling, especially standardized linear regression, but I also backed them up by:
- performing EDA on the input variables and the response,
- looking at coefficient size and significance,
- evaluating model performance,
- checking stability across folds and models.
g. Could EDA help identify similar trends/relationships?¶
Visualizing track_popularity against key features helped me reveal linear trends and other patterns. A correlation heatmap can confirm which features are linearly related to track_popularity. For example, if instrumentalness has a negative correlation, it backs up the model finding. Categorical variables (like genre) can tell us if some genres just tend to have higher/lower popularity across the board, regardless of audio features.
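The instrumentalness example can be checked in one line; a sketch, assuming spotify_df from Part 2 is loaded:
# Sketch: pairwise correlation between instrumentalness and track_popularity.
print(round(spotify_df['instrumentalness'].corr(spotify_df['track_popularity']), 3))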
h. Was clustering consistent with any conclusions from the predictive models?¶
Clustering revealed groups that align closely with the feature influence shown in the regression models. For example, Cluster 4, characterized by high danceability, low instrumentalness, and high valence, is likely composed of popular mainstream pop tracks, reinforcing the regression finding that these features positively influence track_popularity. In contrast, Cluster 0, with high instrumentalness and low valence, aligns with the model's finding that such songs tend to be less popular.
i. What skills did you learn from going through this project?¶
Modeling¶
I learned how to build and interpret multiple linear regression models using statsmodels, including how to write model formulas with both additive and interaction terms.
I saw firsthand how to standardize features before modeling to ensure fair coefficient interpretation and model stability.
Cross-Validation & Evaluation¶
I gained experience implementing K-Fold Cross-Validation to get reliable estimates of model performance.
Learned to interpret and communicate R-squared and RMSE, and to visualize confidence intervals using Seaborn.
Feature Importance & Interpretation¶
- I got much better at interpreting standardized regression coefficients and understanding what they tell me about feature influence.
Clustering Analysis¶
I applied both KMeans and Hierarchical Clustering, using tools like the elbow method to determine the optimal number of clusters.
Learned how to compare clustering results to regression outcomes.
EDA as a Foundation¶
EDA helped me visually detect patterns, select features, and determine when transformations might be needed.
It also helped validate whether patterns seen in modeling made sense.
Critical Thinking & Communication¶
This project taught me why some of these methods work and showed me how to cross-check findings across methods.
I also got practice communicating technical findings.
j. This is not related to application or project inputs/outputs directly. What general skills can you take away from the project to apply to applications more specific to your area of interest?¶
End-to-End Project Execution¶
I practiced managing a full data science workflow, which mirrors how real-world projects unfold.
Model Interpretation & Communication¶
I have professional experience with data visualization but have not used models too often. I learned how to fit models and understand what they're telling us.
Part 2: Exploratory Data Analysis¶
The following will include the most important findings from the EDA. More detailed analysis can be found in the EDA supporting notebook.
We will load in the required packages and dataset to perform EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
songs_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
spotify_df = pd.read_csv( songs_url )
We can take a look at the shape of the dataframe and the column names
print(spotify_df.shape)
print(spotify_df.columns)
(32833, 24) Index(['track_id', 'track_name', 'track_artist', 'track_popularity', 'track_album_id', 'track_album_name', 'track_album_release_date', 'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity_category'], dtype='object')
print(spotify_df.dtypes)
track_id object track_name object track_artist object track_popularity int64 track_album_id object track_album_name object track_album_release_date object playlist_name object playlist_id object playlist_genre object playlist_subgenre object danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 duration_ms int64 popularity_category object dtype: object
Next, we can check for NA values and unique values. Checking the unique values helps us separate our continuous and categorical columns.
spotify_df.isna().sum()
track_id 0 track_name 5 track_artist 5 track_popularity 0 track_album_id 0 track_album_name 5 track_album_release_date 0 playlist_name 0 playlist_id 0 playlist_genre 0 playlist_subgenre 0 danceability 0 energy 0 key 0 loudness 0 mode 0 speechiness 0 acousticness 0 instrumentalness 0 liveness 0 valence 0 tempo 0 duration_ms 0 dtype: int64
spotify_df.nunique()
track_id 28356 track_name 23449 track_artist 10692 track_popularity 101 track_album_id 22545 track_album_name 19743 track_album_release_date 4530 playlist_name 449 playlist_id 471 playlist_genre 6 playlist_subgenre 24 danceability 822 energy 952 key 12 loudness 10222 mode 2 speechiness 1270 acousticness 3731 instrumentalness 4729 liveness 1624 valence 1362 tempo 17684 duration_ms 19785 dtype: int64
Marginal Distributions of Continuous Input Variables¶
Many of our decisions in this project depend on the data being 'Gaussian-like'. I will first take a look at the distributions of our continuous input variables.
The following can be concluded from the graphs of our continuous variables below:
- danceability is fairly normal with a slight left skew
- energy is less normal and skewed more to the left
- loudness is heavily skewed to the left
- speechiness, acousticness, instrumentalness, and liveness are all heavily skewed right
- valence, tempo, and duration_ms are all fairly Gaussian distributions
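These visual reads can be backed with sample skewness, where positive values indicate right skew and negative values left skew; a minimal sketch:
# Sketch: sample skewness for each continuous input (pandas' .skew()).
continuous_vars = ['danceability', 'energy', 'loudness', 'speechiness',
                   'acousticness', 'instrumentalness', 'liveness',
                   'valence', 'tempo', 'duration_ms']
print(spotify_df[continuous_vars].skew().round(2).sort_values())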
continuous_vars = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms']
# Melt the dataframe to long format for faceting
df_melted = pd.melt(spotify_df[continuous_vars], var_name='Variable', value_name='Value')
# Create a faceted histogram
g = sns.FacetGrid(df_melted, col='Variable', col_wrap=4, sharex=False, sharey=False)
g.map(sns.histplot, 'Value', bins=30)
# Adjust layout and titles
g.set_titles('{col_name}')
g.set_axis_labels('Value', 'Count')
plt.tight_layout()
# Show the plot
plt.show()
g = sns.FacetGrid(df_melted, col='Variable', col_wrap=4, sharex=False, sharey=False)
g.map(sns.kdeplot, 'Value')
# Adjust layout and titles
g.set_titles('{col_name}')
g.set_axis_labels('Value', 'Density')
plt.tight_layout()
# Show the plot
plt.show()
Marginal Distributions of the Response Variable (Transformation Included)¶
Next, I will take a look at our response variable. I will take a close look at track_popularity in a few ways since it will be used a lot throughout the project.
The first graph shows the distribution. It is skewed right because of all the 0 values; the values in the 1-100 range are normally distributed.
I also split the data into two categories separated by the median. These will be useful later if we want to turn track_popularity into a categorical variable. The second and third graphs show that the High category has more values and is closer to a Gaussian distribution than the Low category.
The final graph shows the cumulative distribution, i.e. the proportion of tracks at or below each popularity level. It starts off steep since there are almost 5,000 '0' values and tapers off at the top since few songs score very high.
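The size of that spike at zero is easy to confirm directly; a quick sketch:
# Sketch: how many tracks have popularity 0, and what share of the data that is.
n_zero = (spotify_df['track_popularity'] == 0).sum()
print(f"{n_zero} tracks ({n_zero / len(spotify_df):.1%}) have popularity 0")
print(f"Median popularity: {spotify_df['track_popularity'].median():.0f}")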
sns.set_style("whitegrid")
# Calculate the median of track_popularity
median_popularity = spotify_df['track_popularity'].median()
# Create a new column for high/low categories based on median
spotify_df['popularity_category'] = spotify_df['track_popularity'].apply(
lambda x: 'High' if x >= median_popularity else 'Low'
)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10), sharey=False)
sns.histplot(data=spotify_df, x='track_popularity', bins=30, ax=ax1)
ax1.set_title('Distribution of Track Popularity')
ax1.set_xlabel('Track Popularity')
ax1.set_ylabel('Count')
sns.histplot(data=spotify_df, x='track_popularity', hue='popularity_category',
bins=30, multiple='stack', ax=ax2)
ax2.set_title('Track Popularity by High/Low Category (Median Split)')
ax2.set_xlabel('Track Popularity')
ax2.set_ylabel('Count')
sns.histplot(data=spotify_df, x='popularity_category', ax=ax3, discrete=True)
ax3.set_title('Count of Tracks by Popularity Category')
ax3.set_xlabel('Popularity Category')
ax3.set_ylabel('Count')
sns.histplot(data=spotify_df, x='track_popularity', bins=30, cumulative=True,
stat='count', fill=False, element='step', ax=ax4)
ax4.set_title('Cumulative Distribution of Track Popularity')
ax4.set_xlabel('Track Popularity')
ax4.set_ylabel('Cumulative Count')
# Adjust layout
plt.tight_layout()
plt.show()
Marginal Distributions of Categorical Variables¶
The distributions of all categories are Gaussian-like, as seen below.
sns.set_style("whitegrid")
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
sns.histplot(data=spotify_df, x='key', ax=ax1, discrete=True)
ax1.set_title('Distribution of Key')
ax1.set_xlabel('Key')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=45)
sns.histplot(data=spotify_df, x='playlist_genre', ax=ax2, discrete=True)
ax2.set_title('Distribution of Playlist Genre')
ax2.set_xlabel('Playlist Genre')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=45)
sns.histplot(data=spotify_df, x='mode', ax=ax3, discrete=True)
ax3.set_title('Distribution of Mode')
ax3.set_xlabel('Mode')
ax3.set_ylabel('Count')
ax3.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
Variable Relationships¶
We are working on a regression problem, so viewing relationships between continuous variables is crucial. A good first step is a correlation heatmap. The graph shows us the following:
- energy is highly correlated with loudness
- danceability has a moderate correlation with valence
- acousticness has a negative relationship with energy and loudness
- all other correlations are fairly low
track_popularity is our response variable, so it is worth looking into more deeply. Its strongest correlations are negative relationships with instrumentalness and duration_ms.
# Correlation heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(spotify_df[['track_popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()
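To rank the response correlations explicitly rather than reading them off the heatmap, a one-liner sketch:
# Sketch: correlations of each continuous input with track_popularity, sorted.
corr_with_pop = (spotify_df[['track_popularity', 'danceability', 'energy', 'loudness',
                             'speechiness', 'acousticness', 'instrumentalness',
                             'liveness', 'valence', 'tempo', 'duration_ms']]
                 .corr()['track_popularity']
                 .drop('track_popularity')
                 .sort_values())
print(corr_with_pop.round(3))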
Let's take a closer look at the relationships between our continuous inputs and the response. Below are the negative relationships we found in the heatmap.
sns.regplot(x='instrumentalness', y='track_popularity', data=spotify_df, color='blue',
scatter_kws={'alpha': 0.6}, line_kws={'color': 'red'})
plt.title('Relationship Between instrumentalness and track_popularity')
plt.xlabel('Instrumentalness')
plt.ylabel('Track Popularity')
plt.show()
sns.regplot(x='duration_ms', y='track_popularity', data=spotify_df, color='blue',
scatter_kws={'alpha': 0.6}, line_kws={'color': 'red'})
plt.title('Relationship Between duration_ms and track_popularity')
plt.xlabel('Duration (MS)')
plt.ylabel('Track Popularity')
plt.show()
Distributions grouped by categories¶
Next, we can summarize the response with boxplots for the unique values of the categorical inputs. Below are boxplots showing the distribution of track_popularity grouped by 3 different categorical variables. We can infer the following from the graphs:
- The response variable track_popularity has a fairly consistent distribution across the levels of the categorical variables, and it has no outliers. I included the graph of the speechiness distribution to show a variable that varies more across categories and also has more outliers.
- The genre with the highest median popularity is pop, while the lowest is edm (a quick numeric check follows).
- The distributions of track popularity are very similar in both modes.
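As a quick numeric check on the genre claim (a sketch):
# Sketch: median track_popularity per playlist_genre, highest first.
print(spotify_df.groupby('playlist_genre')['track_popularity']
      .median().sort_values(ascending=False))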
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.boxplot(x='key', y='track_popularity', hue='key', data=spotify_df, palette='muted', legend=False)
plt.title('track_popularity by Musical Key')
plt.xlabel('Key')
plt.ylabel('track_popularity')
plt.subplot(1, 2, 2)
sns.boxplot(x='playlist_genre', y='track_popularity', hue='playlist_genre', data=spotify_df, palette='muted', legend=False)
plt.title('track_popularity by Playlist Genre')
plt.xlabel('Playlist Genre')
plt.ylabel('track_popularity')
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 6))
sns.boxplot(x='mode', y='track_popularity', hue='mode', data=spotify_df, palette='muted', legend=False)
plt.title('track_popularity by Mode')
plt.xlabel('Mode')
plt.ylabel('track_popularity')
plt.show()
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.boxplot(x='key', y='speechiness', hue='key', data=spotify_df, palette='muted', legend=False)
plt.title('speechiness by Musical Key')
plt.xlabel('Key')
plt.ylabel('speechiness')
plt.subplot(1, 2, 2)
sns.boxplot(x='playlist_genre', y='speechiness', hue='playlist_genre', data=spotify_df, palette='muted', legend=False)
plt.title('speechiness by Playlist Genre')
plt.xlabel('Playlist Genre')
plt.ylabel('speechiness')
plt.tight_layout()
plt.show()
Lastly, we can use two categorical variables at the same time with track_popularity¶
Above, when we looked at the distribution by mode there was little difference. However, when we combine mode and playlist_genre there is a visible difference in a few categories.
plt.figure(figsize=(10, 12))
plt.subplot(2, 1, 1)
sns.violinplot(data=spotify_df, y='track_popularity', x='playlist_genre', hue='mode', palette='muted')
plt.title('track_popularity by Playlist Genre (Violin Plot)')
plt.xlabel('Playlist Genre')
plt.ylabel('track_popularity')
plt.subplot(2, 1, 2)
sns.boxplot(data=spotify_df, y='track_popularity', x='playlist_genre', hue='mode', palette='muted', legend = False)
plt.title('track_popularity by Playlist Genre (Boxplot)')
plt.xlabel('Playlist Genre')
plt.ylabel('track_popularity')
plt.tight_layout()
plt.show()
Part 3: Clustering¶
K-Means¶
First, we can import the packages needed for clustering and scaling our values
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
continuous_vars = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms']
I will create a copy of the dataframe with our continuous inputs so it does not get mixed up with other dataframes.
spotify_df_clean = spotify_df.copy()
spotify_df_clean = spotify_df_clean[continuous_vars]
sns.catplot(data = spotify_df_clean, kind='box', aspect=2)
plt.show()
We can see that the scale is off for our variables since duration_ms has much greater values than the others.
X = StandardScaler().fit_transform(spotify_df_clean)
sns.catplot(data = pd.DataFrame(X, columns = spotify_df_clean.columns), kind='box', aspect=2)
plt.show()
Now we have scaled values and are ready to perform clustering. I will start with 2 clusters and analyze the counts
clusters_2 = KMeans(n_clusters=2, random_state=100, n_init=25, max_iter=500).fit_predict( X )
spotify_cluster = spotify_df_clean.copy()
spotify_cluster['k2'] = pd.Series(clusters_2, index=spotify_cluster.index ).astype('category')
spotify_cluster.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32833 entries, 0 to 32832 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 danceability 32833 non-null float64 1 energy 32833 non-null float64 2 loudness 32833 non-null float64 3 speechiness 32833 non-null float64 4 acousticness 32833 non-null float64 5 instrumentalness 32833 non-null float64 6 liveness 32833 non-null float64 7 valence 32833 non-null float64 8 tempo 32833 non-null float64 9 duration_ms 32833 non-null int64 10 k2 32833 non-null category dtypes: category(1), float64(9), int64(1) memory usage: 2.5 MB
spotify_cluster.k2.value_counts()
k2 1 23149 0 9684 Name: count, dtype: int64
The cluster counts are very different. We can use the knee-bend (elbow) plot to identify the optimal number of clusters.
tots_within = []
K = range(1, 31)
for k in K:
km = KMeans(n_clusters=k, random_state=100, n_init=25, max_iter=500)
km = km.fit( X )
tots_within.append( km.inertia_ )
fig, ax = plt.subplots()
ax.plot( K, tots_within, 'bo-' )
ax.set_xlabel('number of clusters')
ax.set_ylabel('total within sum of squares')
plt.show()
The plot appears to straighten after about 6 clusters, so 6 clusters is a better choice than 2.
spotify_cluster = spotify_df_clean.copy()
clusters_6 = KMeans(n_clusters=6, random_state=100, n_init=25, max_iter=500).fit_predict( X )
spotify_cluster['k6'] = pd.Series( clusters_6, index=spotify_cluster.index ).astype('category')
cluster_summary = spotify_cluster.groupby('k6')[[
'danceability', 'energy', 'loudness', 'speechiness',
'acousticness', 'instrumentalness', 'liveness',
'valence', 'tempo', 'duration_ms'
]].mean().round(3)
cluster_sizes = spotify_cluster['k6'].value_counts().sort_index()
cluster_summary['count'] = cluster_sizes
cluster_summary
| k6 | danceability | energy | loudness | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | count |
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.663 | 0.783 | -6.972 | 0.071 | 0.072 | 0.746 | 0.171 | 0.387 | 125.088 | 252107.027 | 2470 |
| 1 | 0.546 | 0.791 | -5.327 | 0.071 | 0.068 | 0.023 | 0.173 | 0.379 | 132.634 | 227385.497 | 8463 |
| 2 | 0.610 | 0.780 | -5.976 | 0.113 | 0.114 | 0.054 | 0.614 | 0.510 | 121.547 | 231937.778 | 2128 |
| 3 | 0.726 | 0.663 | -6.823 | 0.313 | 0.185 | 0.010 | 0.174 | 0.544 | 122.614 | 215458.664 | 4322 |
| 4 | 0.740 | 0.724 | -6.212 | 0.075 | 0.142 | 0.014 | 0.149 | 0.680 | 113.552 | 220875.123 | 10768 |
| 5 | 0.605 | 0.425 | -10.513 | 0.073 | 0.519 | 0.093 | 0.148 | 0.393 | 112.372 | 227137.577 | 4682 |
spotify_cluster.k6.value_counts()
k6 4 10768 1 8463 5 4682 3 4322 0 2470 2 2128 Name: count, dtype: int64
Now that we have conducted clustering with 6 clusters, we can view the counts. The clusters are still not balanced in comparison to any of our categorical variables.
sns.catplot(data=spotify_cluster, x='k6', kind='count')
plt.show()
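To make that comparison concrete, cluster membership can be cross-tabulated against a categorical variable; a minimal sketch, assuming spotify_cluster and spotify_df still share the same row order:
# Sketch: contingency table of K-Means clusters (k6) vs. playlist_genre.
# Rows are clusters, columns are genres, cells are track counts.
print(pd.crosstab(spotify_cluster['k6'], spotify_df['playlist_genre']))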
Hierarchical¶
We can perform Hierarchical Clustering and compare the two methods. I displayed the hierarchical results in a dendrogram.
hclust_ward = hierarchy.ward(spotify_cluster)
fig = plt.figure(figsize=(12, 6))
dn = hierarchy.dendrogram(hclust_ward, no_labels=True )
plt.show()
We will use 3 clusters since this minimizes the sensitivity to the height of the cut.
spotify_cluster['hclust_3'] = pd.Series(hierarchy.cut_tree(hclust_ward, n_clusters=3).ravel(),
index=spotify_cluster.index ).astype('category')
spotify_cluster.hclust_3.value_counts()
hclust_3 1 16628 0 12217 2 3988 Name: count, dtype: int64
sns.catplot(data=spotify_cluster, x='hclust_3', kind='count')
plt.show()
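To compare the two methods directly, the K-Means and hierarchical labels (both stored on spotify_cluster at this point) can be cross-tabulated; a sketch:
# Sketch: agreement between K-Means (k6) and hierarchical (hclust_3) labels.
print(pd.crosstab(spotify_cluster['k6'], spotify_cluster['hclust_3']))
A heavy concentration along a few cells would indicate the two methods partition the songs similarly, while a wide spread would indicate they disagree.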
Interpretation¶
Note: this interpretation is written with hindsight, since model performance is not analyzed until Part 4.
Clustering revealed groups that align closely with the feature influence shown in the regression models. For example, Cluster 4, characterized by high danceability, low instrumentalness, and high valence, is likely composed of popular mainstream pop tracks, reinforcing the regression finding that these features positively influence track_popularity. In contrast, Cluster 0, with high instrumentalness and low valence, aligns with the model's finding that such songs tend to be less popular.
Part 4: Models: Fitting and Interpretation¶
I will be importing one more package for modeling, as well as defining some UDFs to create coefficient plots. The second one is a modified version that extends the plot height for models with more coefficients.
import statsmodels.formula.api as smf
def my_coefplot(mod, figsize_use=(10, 4)):
"""
This is a function that takes a fitted model and graphs all regression coefficient
values, along with an error bar based on the standard error.
"""
fig, ax = plt.subplots(figsize=figsize_use)
ax.errorbar(y=mod.params.index,
x=mod.params,
xerr = 2 * mod.bse,
fmt='o', color='k', ecolor='k', elinewidth=2, ms=10)
ax.axvline(x=0, linestyle='--', linewidth=3.5, color='grey')
ax.set_xlabel('coefficient value')
plt.show()
def my_coefplot2(mod, figsize_use=(10, 15)):
    """
    Same as my_coefplot, but with a taller default figure so models
    with many coefficients remain readable.
    """
fig, ax = plt.subplots(figsize=figsize_use)
ax.errorbar(y=mod.params.index,
x=mod.params,
xerr = 2 * mod.bse,
fmt='o', color='k', ecolor='k', elinewidth=2, ms=10)
ax.axvline(x=0, linestyle='--', linewidth=3.5, color='grey')
ax.set_xlabel('coefficient value')
plt.show()
Since our variables have different scales we will standardize them before modeling
spotify_model = spotify_df.copy()
spotify_model[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms', 'track_popularity']] = StandardScaler().fit_transform(spotify_model[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms', 'track_popularity']])
Model 1: Intercept-only¶
fit_m1 = smf.ols(formula='track_popularity ~ 1', data=spotify_model).fit()
print(fit_m1.summary())
OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.000 Model: OLS Adj. R-squared: 0.000 Method: Least Squares F-statistic: nan Date: Mon, 21 Apr 2025 Prob (F-statistic): nan Time: 19:18:40 Log-Likelihood: -46588. No. Observations: 32833 AIC: 9.318e+04 Df Residuals: 32832 BIC: 9.319e+04 Df Model: 0 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -1.388e-16 0.006 -2.51e-14 1.000 -0.011 0.011 ============================================================================== Omnibus: 5603.060 Durbin-Watson: 1.054 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1488.251 Skew: -0.233 Prob(JB): 0.00 Kurtosis: 2.067 Cond. No. 1.00 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
The only coefficient is our intercept
fit_m1.params
Intercept -1.387779e-16 dtype: float64
This coefficient is not significant when we use a p-value threshold of 0.05.
fit_m1.pvalues < 0.05
Intercept False dtype: bool
The coefficient is very close to 0
my_coefplot(fit_m1)
Model 2: Categorical inputs with additive features¶
fit_m2 = smf.ols('track_popularity ~ C(playlist_genre) + C(key) + C(mode)', data=spotify_model).fit()
# Summary
print("Model 2: Categorical Inputs")
print(fit_m2.summary())
Model 2: Categorical Inputs OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.032 Model: OLS Adj. R-squared: 0.031 Method: Least Squares F-statistic: 63.45 Date: Mon, 21 Apr 2025 Prob (F-statistic): 5.96e-215 Time: 19:18:41 Log-Likelihood: -46057. No. Observations: 32833 AIC: 9.215e+04 Df Residuals: 32815 BIC: 9.230e+04 Df Model: 17 Covariance Type: nonrobust ============================================================================================== coef std err t P>|t| [0.025 0.975] ---------------------------------------------------------------------------------------------- Intercept -0.3012 0.022 -13.748 0.000 -0.344 -0.258 C(playlist_genre)[T.latin] 0.4882 0.019 26.134 0.000 0.452 0.525 C(playlist_genre)[T.pop] 0.5164 0.018 28.115 0.000 0.480 0.552 C(playlist_genre)[T.r&b] 0.2554 0.018 13.873 0.000 0.219 0.291 C(playlist_genre)[T.rap] 0.3334 0.018 18.348 0.000 0.298 0.369 C(playlist_genre)[T.rock] 0.2819 0.019 14.765 0.000 0.244 0.319 C(key)[T.1] 0.0092 0.023 0.401 0.688 -0.036 0.054 C(key)[T.2] -0.0414 0.025 -1.659 0.097 -0.090 0.008 C(key)[T.3] -0.0490 0.037 -1.331 0.183 -0.121 0.023 C(key)[T.4] -0.0277 0.027 -1.020 0.308 -0.081 0.026 C(key)[T.5] 0.0019 0.026 0.073 0.942 -0.048 0.052 C(key)[T.6] -0.0021 0.026 -0.081 0.936 -0.052 0.048 C(key)[T.7] -0.0744 0.024 -3.117 0.002 -0.121 -0.028 C(key)[T.8] 0.0726 0.026 2.783 0.005 0.021 0.124 C(key)[T.9] -0.0349 0.025 -1.415 0.157 -0.083 0.013 C(key)[T.10] 0.0034 0.027 0.125 0.901 -0.050 0.056 C(key)[T.11] -0.0114 0.025 -0.458 0.647 -0.060 0.037 C(mode)[T.1] 0.0119 0.012 1.026 0.305 -0.011 0.035 ============================================================================== Omnibus: 3839.611 Durbin-Watson: 1.089 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1368.916 Skew: -0.267 Prob(JB): 5.54e-298 Kurtosis: 2.155 Cond. No. 14.5 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
- 18 coefficients were estimated: 1 intercept plus 17 dummy variables from the categorical features (playlist_genre, key, mode).
- 8 coefficients are statistically significant.
Positive significant features:
- playlist_genre: latin, pop, r&b, rap, rock
- key = 8
Negative significant features:
- Intercept
- key = 7
The highest magnitudes are:
- C(playlist_genre)[T.pop] = +0.5164
- C(playlist_genre)[T.latin] = +0.4882
fit_m2.params
Intercept -0.301170 C(playlist_genre)[T.latin] 0.488215 C(playlist_genre)[T.pop] 0.516387 C(playlist_genre)[T.r&b] 0.255383 C(playlist_genre)[T.rap] 0.333444 C(playlist_genre)[T.rock] 0.281866 C(key)[T.1] 0.009205 C(key)[T.2] -0.041449 C(key)[T.3] -0.048954 C(key)[T.4] -0.027712 C(key)[T.5] 0.001872 C(key)[T.6] -0.002062 C(key)[T.7] -0.074406 C(key)[T.8] 0.072641 C(key)[T.9] -0.034858 C(key)[T.10] 0.003360 C(key)[T.11] -0.011420 C(mode)[T.1] 0.011885 dtype: float64
8 coefficients are statistically significant.
fit_m2.pvalues < 0.05
Intercept True C(playlist_genre)[T.latin] True C(playlist_genre)[T.pop] True C(playlist_genre)[T.r&b] True C(playlist_genre)[T.rap] True C(playlist_genre)[T.rock] True C(key)[T.1] False C(key)[T.2] False C(key)[T.3] False C(key)[T.4] False C(key)[T.5] False C(key)[T.6] False C(key)[T.7] True C(key)[T.8] True C(key)[T.9] False C(key)[T.10] False C(key)[T.11] False C(mode)[T.1] False dtype: bool
my_coefplot(fit_m2)
Model 3: Continuous inputs with linear additive features¶
fit_m3 = smf.ols('track_popularity ~ danceability + energy + loudness + instrumentalness + liveness + valence + tempo + duration_ms', data=spotify_model).fit()
# Summary
print("Model 3: Continuous Inputs")
print(fit_m3.summary())
Model 3: Continuous Inputs OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.071 Model: OLS Adj. R-squared: 0.070 Method: Least Squares F-statistic: 311.8 Date: Mon, 21 Apr 2025 Prob (F-statistic): 0.00 Time: 19:18:41 Log-Likelihood: -45385. No. Observations: 32833 AIC: 9.079e+04 Df Residuals: 32824 BIC: 9.086e+04 Df Model: 8 Covariance Type: nonrobust ==================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------ Intercept -2.636e-16 0.005 -4.95e-14 1.000 -0.010 0.010 danceability 0.0187 0.006 3.157 0.002 0.007 0.030 energy -0.2295 0.008 -29.315 0.000 -0.245 -0.214 loudness 0.1834 0.008 23.892 0.000 0.168 0.198 instrumentalness -0.1033 0.006 -18.367 0.000 -0.114 -0.092 liveness -0.0292 0.005 -5.372 0.000 -0.040 -0.019 valence 0.0302 0.006 5.077 0.000 0.019 0.042 tempo 0.0189 0.005 3.458 0.001 0.008 0.030 duration_ms -0.1102 0.005 -20.330 0.000 -0.121 -0.100 ============================================================================== Omnibus: 3839.905 Durbin-Watson: 1.173 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1428.491 Skew: -0.290 Prob(JB): 6.41e-311 Kurtosis: 2.159 Cond. No. 2.67 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
- 9 coefficients were estimated (intercept plus 8 inputs).
- 8 coefficients are statistically significant.
- Positive & significant: danceability, loudness, valence, tempo
- Negative & significant: energy, instrumentalness, liveness, duration_ms
The highest magnitudes are:
- energy ≈ -0.2295
- loudness ≈ +0.1834
fit_m3.pvalues < 0.05
Intercept False danceability True energy True loudness True instrumentalness True liveness True valence True tempo True duration_ms True dtype: bool
my_coefplot(fit_m3)
Model 4: All inputs (continuous and categorical with linear additive features)¶
fit_m4 = smf.ols('track_popularity ~ C(playlist_genre) + C(key) + C(mode) + loudness + danceability + energy + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms', data=spotify_model).fit()
# Summary
print("Model 4: All Inputs")
print(fit_m4.summary())
Model 4: All Inputs OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.091 Model: OLS Adj. R-squared: 0.091 Method: Least Squares F-statistic: 122.2 Date: Mon, 21 Apr 2025 Prob (F-statistic): 0.00 Time: 19:18:41 Log-Likelihood: -45015. No. Observations: 32833 AIC: 9.009e+04 Df Residuals: 32805 BIC: 9.032e+04 Df Model: 27 Covariance Type: nonrobust ============================================================================================== coef std err t P>|t| [0.025 0.975] ---------------------------------------------------------------------------------------------- Intercept -0.2246 0.022 -10.202 0.000 -0.268 -0.181 C(playlist_genre)[T.latin] 0.3301 0.020 16.762 0.000 0.291 0.369 C(playlist_genre)[T.pop] 0.3876 0.019 20.493 0.000 0.351 0.425 C(playlist_genre)[T.r&b] 0.1097 0.020 5.356 0.000 0.070 0.150 C(playlist_genre)[T.rap] 0.1690 0.020 8.471 0.000 0.130 0.208 C(playlist_genre)[T.rock] 0.3807 0.021 17.894 0.000 0.339 0.422 C(key)[T.1] 0.0109 0.022 0.490 0.624 -0.033 0.055 C(key)[T.2] -0.0282 0.024 -1.162 0.245 -0.076 0.019 C(key)[T.3] -0.0742 0.036 -2.078 0.038 -0.144 -0.004 C(key)[T.4] -0.0183 0.026 -0.696 0.486 -0.070 0.033 C(key)[T.5] 0.0023 0.025 0.092 0.927 -0.046 0.051 C(key)[T.6] 0.0163 0.025 0.656 0.512 -0.032 0.065 C(key)[T.7] -0.0468 0.023 -2.021 0.043 -0.092 -0.001 C(key)[T.8] 0.0609 0.025 2.406 0.016 0.011 0.111 C(key)[T.9] -0.0224 0.024 -0.939 0.348 -0.069 0.024 C(key)[T.10] 0.0222 0.026 0.849 0.396 -0.029 0.073 C(key)[T.11] 0.0115 0.024 0.476 0.634 -0.036 0.059 C(mode)[T.1] 0.0093 0.011 0.832 0.405 -0.013 0.031 loudness 0.1941 0.008 25.025 0.000 0.179 0.209 danceability 0.0603 0.007 9.120 0.000 0.047 0.073 energy -0.2129 0.009 -24.016 0.000 -0.230 -0.196 speechiness -0.0093 0.006 -1.545 0.122 -0.021 0.002 acousticness 0.0258 0.006 4.012 0.000 0.013 0.038 instrumentalness -0.0816 0.006 -14.063 0.000 -0.093 -0.070 liveness -0.0199 0.005 -3.687 0.000 -0.031 -0.009 valence -0.0087 0.006 -1.392 0.164 -0.021 0.004 tempo 0.0255 0.005 4.671 0.000 0.015 0.036 duration_ms -0.1089 0.005 -19.897 0.000 -0.120 -0.098 ============================================================================== Omnibus: 3186.885 Durbin-Watson: 1.198 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1398.864 Skew: -0.315 Prob(JB): 1.74e-304 Kurtosis: 2.209 Cond. No. 16.9 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
- 28 coefficients were estimated (intercept plus 27 model terms).
- 17 coefficients are statistically significant.
Positive significant features:
- C(playlist_genre)[T.latin] ≈ +0.3301
- C(playlist_genre)[T.pop] ≈ +0.3876
- C(playlist_genre)[T.r&b] ≈ +0.1097
- C(playlist_genre)[T.rap] ≈ +0.1690
- C(playlist_genre)[T.rock] ≈ +0.3807
- C(key)[T.8] ≈ +0.0609
- loudness ≈ +0.1941
- danceability ≈ +0.0603
- acousticness ≈ +0.0258
- tempo ≈ +0.0255
Negative significant features:
- Intercept ≈ -0.2246
- C(key)[T.3] ≈ -0.0742
- C(key)[T.7] ≈ -0.0468
- energy ≈ -0.2129
- instrumentalness ≈ -0.0816
- liveness ≈ -0.0199
- duration_ms ≈ -0.1089
The highest magnitudes are:
- energy ≈ -0.2129
- loudness ≈ +0.1941
fit_m4.pvalues < 0.05
Intercept True C(playlist_genre)[T.latin] True C(playlist_genre)[T.pop] True C(playlist_genre)[T.r&b] True C(playlist_genre)[T.rap] True C(playlist_genre)[T.rock] True C(key)[T.1] False C(key)[T.2] False C(key)[T.3] True C(key)[T.4] False C(key)[T.5] False C(key)[T.6] False C(key)[T.7] True C(key)[T.8] True C(key)[T.9] False C(key)[T.10] False C(key)[T.11] False C(mode)[T.1] False loudness True danceability True energy True speechiness False acousticness True instrumentalness True liveness True valence False tempo True duration_ms True dtype: bool
my_coefplot(fit_m4)
Model 5: Continuous inputs with linear main effects and pair-wise interactions¶
For the remainder of the models I will drop speechiness and acousticness, since their coefficients have been small or insignificant so far.
# Fit continuous with interactions
fit_m5 = smf.ols('track_popularity ~ (loudness + danceability + energy + instrumentalness + liveness + valence + tempo + duration_ms)**2', data=spotify_model).fit()
# Summary
print("Model 5: Continuous with Interactions")
print(fit_m5.summary())
Model 5: Continuous with Interactions OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.084 Model: OLS Adj. R-squared: 0.083 Method: Least Squares F-statistic: 83.48 Date: Mon, 21 Apr 2025 Prob (F-statistic): 0.00 Time: 19:18:41 Log-Likelihood: -45149. No. Observations: 32833 AIC: 9.037e+04 Df Residuals: 32796 BIC: 9.068e+04 Df Model: 36 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- Intercept -0.0111 0.007 -1.555 0.120 -0.025 0.003 loudness 0.1882 0.008 22.775 0.000 0.172 0.204 danceability 0.0207 0.006 3.346 0.001 0.009 0.033 energy -0.2265 0.008 -27.947 0.000 -0.242 -0.211 instrumentalness -0.1213 0.007 -18.043 0.000 -0.135 -0.108 liveness -0.0253 0.006 -4.413 0.000 -0.037 -0.014 valence 0.0266 0.006 4.376 0.000 0.015 0.039 tempo 0.0204 0.006 3.274 0.001 0.008 0.033 duration_ms -0.1199 0.006 -20.110 0.000 -0.132 -0.108 loudness:danceability 0.0385 0.007 5.300 0.000 0.024 0.053 loudness:energy -0.0224 0.005 -4.846 0.000 -0.031 -0.013 loudness:instrumentalness -0.0611 0.007 -9.281 0.000 -0.074 -0.048 loudness:liveness 0.0150 0.008 1.980 0.048 0.000 0.030 loudness:valence 0.0347 0.008 4.280 0.000 0.019 0.051 loudness:tempo 0.0121 0.008 1.588 0.112 -0.003 0.027 loudness:duration_ms -0.0508 0.007 -7.519 0.000 -0.064 -0.038 danceability:energy -0.0476 0.008 -6.153 0.000 -0.063 -0.032 danceability:instrumentalness -0.0099 0.006 -1.635 0.102 -0.022 0.002 danceability:liveness 0.0181 0.006 3.130 0.002 0.007 0.029 danceability:valence 0.0080 0.006 1.351 0.177 -0.004 0.020 danceability:tempo 0.0060 0.006 1.067 0.286 -0.005 0.017 danceability:duration_ms -0.0275 0.005 -5.094 0.000 -0.038 -0.017 energy:instrumentalness 0.0232 0.007 3.325 0.001 0.010 0.037 energy:liveness -0.0127 0.008 -1.610 0.107 -0.028 0.003 energy:valence 0.0185 0.008 2.256 0.024 0.002 0.035 energy:tempo -0.0110 0.008 -1.364 0.173 -0.027 0.005 energy:duration_ms 0.0233 0.007 3.275 0.001 0.009 0.037 instrumentalness:liveness 0.0004 0.006 0.062 0.951 -0.011 0.012 instrumentalness:valence -0.0045 0.005 -0.828 0.408 -0.015 0.006 instrumentalness:tempo -0.0067 0.007 -0.952 0.341 -0.020 0.007 instrumentalness:duration_ms -0.0079 0.004 -1.827 0.068 -0.016 0.001 liveness:valence 0.0030 0.006 0.491 0.623 -0.009 0.015 liveness:tempo 0.0018 0.006 0.320 0.749 -0.009 0.013 liveness:duration_ms 0.0122 0.005 2.393 0.017 0.002 0.022 valence:tempo 0.0047 0.006 0.739 0.460 -0.008 0.017 valence:duration_ms 0.0043 0.006 0.761 0.447 -0.007 0.016 tempo:duration_ms -0.0002 0.006 -0.031 0.975 -0.012 0.011 ============================================================================== Omnibus: 3553.545 Durbin-Watson: 1.202 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1432.870 Skew: -0.306 Prob(JB): 0.00 Kurtosis: 2.180 Cond. No. 5.82 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
- 37 coefficients were estimated (intercept plus 36 model terms).
- 21 coefficients are statistically significant.
Positive significant features:
- loudness +0.1882
- danceability +0.0207
- valence +0.0266
- tempo +0.0204
- loudness:danceability +0.0385
- loudness:liveness +0.0150
- loudness:valence +0.0347
- danceability:liveness +0.0181
- energy:instrumentalness +0.0232
- energy:valence +0.0185
- energy:duration_ms +0.0233
- liveness:duration_ms +0.0122
Negative significant features:
- energy -0.2265
- instrumentalness -0.1213
- liveness -0.0253
- duration_ms -0.1199
- loudness:energy -0.0224
- loudness:instrumentalness -0.0611
- loudness:duration_ms -0.0508
- danceability:energy -0.0476
- danceability:duration_ms -0.0275
The highest magnitudes are:
- energy ≈ -0.2265
- loudness ≈ +0.1882
fit_m5.pvalues < 0.05
Intercept False loudness True danceability True energy True instrumentalness True liveness True valence True tempo True duration_ms True loudness:danceability True loudness:energy True loudness:instrumentalness True loudness:liveness True loudness:valence True loudness:tempo False loudness:duration_ms True danceability:energy True danceability:instrumentalness False danceability:liveness True danceability:valence False danceability:tempo False danceability:duration_ms True energy:instrumentalness True energy:liveness False energy:valence True energy:tempo False energy:duration_ms True instrumentalness:liveness False instrumentalness:valence False instrumentalness:tempo False instrumentalness:duration_ms False liveness:valence False liveness:tempo False liveness:duration_ms True valence:tempo False valence:duration_ms False tempo:duration_ms False dtype: bool
my_coefplot2(fit_m5)
Model 6: Interacting the categorical inputs with the continuous inputs (linear main effects included)¶
fit_m6 = smf.ols('track_popularity ~ C(playlist_genre) * (loudness + danceability + energy + instrumentalness + liveness + valence + tempo + duration_ms) + C(key) + C(mode)', data=spotify_model).fit()
# Summary
print("Model 6: Categorical-Continuous Interactions")
print(fit_m6.summary())
Model 6: Categorical-Continuous Interactions OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.107 Model: OLS Adj. R-squared: 0.105 Method: Least Squares F-statistic: 60.39 Date: Mon, 21 Apr 2025 Prob (F-statistic): 0.00 Time: 19:18:42 Log-Likelihood: -44730. No. Observations: 32833 AIC: 8.959e+04 Df Residuals: 32767 BIC: 9.015e+04 Df Model: 65 Covariance Type: nonrobust =============================================================================================================== coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------------------------- Intercept -0.1388 0.024 -5.747 0.000 -0.186 -0.091 C(playlist_genre)[T.latin] 0.2562 0.023 11.036 0.000 0.211 0.302 C(playlist_genre)[T.pop] 0.2970 0.022 13.653 0.000 0.254 0.340 C(playlist_genre)[T.r&b] 0.0937 0.024 3.855 0.000 0.046 0.141 C(playlist_genre)[T.rap] 0.0559 0.022 2.498 0.012 0.012 0.100 C(playlist_genre)[T.rock] 0.2000 0.029 7.016 0.000 0.144 0.256 C(key)[T.1] 0.0103 0.022 0.467 0.640 -0.033 0.054 C(key)[T.2] -0.0255 0.024 -1.059 0.290 -0.073 0.022 C(key)[T.3] -0.0663 0.035 -1.871 0.061 -0.136 0.003 C(key)[T.4] -0.0080 0.026 -0.304 0.761 -0.059 0.043 C(key)[T.5] 0.0054 0.025 0.220 0.826 -0.043 0.054 C(key)[T.6] 0.0169 0.025 0.687 0.492 -0.031 0.065 C(key)[T.7] -0.0417 0.023 -1.814 0.070 -0.087 0.003 C(key)[T.8] 0.0655 0.025 2.605 0.009 0.016 0.115 C(key)[T.9] -0.0206 0.024 -0.870 0.384 -0.067 0.026 C(key)[T.10] 0.0290 0.026 1.118 0.264 -0.022 0.080 C(key)[T.11] 0.0080 0.024 0.332 0.740 -0.039 0.055 C(mode)[T.1] 0.0073 0.011 0.657 0.511 -0.015 0.029 loudness 0.0691 0.022 3.140 0.002 0.026 0.112 C(playlist_genre)[T.latin]:loudness 0.2783 0.029 9.753 0.000 0.222 0.334 C(playlist_genre)[T.pop]:loudness 0.2078 0.031 6.804 0.000 0.148 0.268 C(playlist_genre)[T.r&b]:loudness 0.1779 0.028 6.342 0.000 0.123 0.233 C(playlist_genre)[T.rap]:loudness 0.0158 0.029 0.551 0.581 -0.040 0.072 C(playlist_genre)[T.rock]:loudness -0.0032 0.029 -0.111 0.912 -0.060 0.054 danceability 0.0001 0.016 0.008 0.994 -0.032 0.032 C(playlist_genre)[T.latin]:danceability 0.0165 0.025 0.666 0.506 -0.032 0.065 C(playlist_genre)[T.pop]:danceability 0.0839 0.023 3.618 0.000 0.038 0.129 C(playlist_genre)[T.r&b]:danceability 0.0310 0.022 1.396 0.163 -0.013 0.075 C(playlist_genre)[T.rap]:danceability 0.1349 0.022 6.110 0.000 0.092 0.178 C(playlist_genre)[T.rock]:danceability 0.0623 0.024 2.585 0.010 0.015 0.109 energy -0.2129 0.022 -9.497 0.000 -0.257 -0.169 C(playlist_genre)[T.latin]:energy -0.1508 0.030 -4.959 0.000 -0.210 -0.091 C(playlist_genre)[T.pop]:energy -0.0143 0.030 -0.476 0.634 -0.073 0.045 C(playlist_genre)[T.r&b]:energy 0.0290 0.029 1.001 0.317 -0.028 0.086 C(playlist_genre)[T.rap]:energy 0.0770 0.031 2.519 0.012 0.017 0.137 C(playlist_genre)[T.rock]:energy 0.1226 0.030 4.032 0.000 0.063 0.182 instrumentalness -0.0737 0.010 -7.537 0.000 -0.093 -0.055 C(playlist_genre)[T.latin]:instrumentalness -0.0130 0.021 -0.619 0.536 -0.054 0.028 C(playlist_genre)[T.pop]:instrumentalness -0.0193 0.019 -0.995 0.320 -0.057 0.019 C(playlist_genre)[T.r&b]:instrumentalness 0.0067 0.026 0.259 0.796 -0.044 0.057 C(playlist_genre)[T.rap]:instrumentalness 0.0609 0.017 3.521 0.000 0.027 0.095 C(playlist_genre)[T.rock]:instrumentalness -0.0357 0.020 -1.777 0.076 -0.075 0.004 liveness -0.0102 0.011 -0.908 0.364 -0.032 0.012 C(playlist_genre)[T.latin]:liveness -0.0049 0.018 -0.274 0.784 -0.040 0.030 
C(playlist_genre)[T.pop]:liveness 0.0157 0.019 0.849 0.396 -0.021 0.052 C(playlist_genre)[T.r&b]:liveness -0.0344 0.018 -1.892 0.059 -0.070 0.001 C(playlist_genre)[T.rap]:liveness 0.0079 0.017 0.454 0.650 -0.026 0.042 C(playlist_genre)[T.rock]:liveness -0.0519 0.017 -3.086 0.002 -0.085 -0.019 valence 0.0680 0.014 4.824 0.000 0.040 0.096 C(playlist_genre)[T.latin]:valence -0.0714 0.021 -3.400 0.001 -0.113 -0.030 C(playlist_genre)[T.pop]:valence -0.0701 0.021 -3.327 0.001 -0.111 -0.029 C(playlist_genre)[T.r&b]:valence -0.1752 0.021 -8.240 0.000 -0.217 -0.133 C(playlist_genre)[T.rap]:valence -0.0939 0.020 -4.729 0.000 -0.133 -0.055 C(playlist_genre)[T.rock]:valence -0.0839 0.022 -3.778 0.000 -0.127 -0.040 tempo -0.0600 0.022 -2.713 0.007 -0.103 -0.017 C(playlist_genre)[T.latin]:tempo 0.0872 0.025 3.429 0.001 0.037 0.137 C(playlist_genre)[T.pop]:tempo 0.0518 0.026 1.965 0.049 0.000 0.104 C(playlist_genre)[T.r&b]:tempo 0.0921 0.025 3.642 0.000 0.043 0.142 C(playlist_genre)[T.rap]:tempo 0.1103 0.025 4.486 0.000 0.062 0.158 C(playlist_genre)[T.rock]:tempo 0.0825 0.026 3.176 0.001 0.032 0.133 duration_ms -0.1686 0.011 -14.804 0.000 -0.191 -0.146 C(playlist_genre)[T.latin]:duration_ms 0.1196 0.020 5.984 0.000 0.080 0.159 C(playlist_genre)[T.pop]:duration_ms 0.0718 0.021 3.408 0.001 0.031 0.113 C(playlist_genre)[T.r&b]:duration_ms 0.0085 0.018 0.479 0.632 -0.026 0.043 C(playlist_genre)[T.rap]:duration_ms 0.0707 0.018 4.032 0.000 0.036 0.105 C(playlist_genre)[T.rock]:duration_ms 0.1592 0.017 9.330 0.000 0.126 0.193 ============================================================================== Omnibus: 2706.291 Durbin-Watson: 1.228 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1326.963 Skew: -0.321 Prob(JB): 7.14e-289 Kurtosis: 2.253 Cond. No. 20.8 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
- 66 coefficients were estimated.
- 42 coefficients are statistically significant:
- 26 significant positive coefficients
- 16 significant negative coefficients
The highest magnitudes are:
- C(playlist_genre)[T.latin]:loudness ≈ +0.2783
- energy ≈ -0.2129
fit_m6.params
Intercept -0.138817 C(playlist_genre)[T.latin] 0.256172 C(playlist_genre)[T.pop] 0.297040 C(playlist_genre)[T.r&b] 0.093727 C(playlist_genre)[T.rap] 0.055926 ... C(playlist_genre)[T.latin]:duration_ms 0.119633 C(playlist_genre)[T.pop]:duration_ms 0.071841 C(playlist_genre)[T.r&b]:duration_ms 0.008481 C(playlist_genre)[T.rap]:duration_ms 0.070652 C(playlist_genre)[T.rock]:duration_ms 0.159195 Length: 66, dtype: float64
fit_m6.pvalues < 0.05
Intercept True C(playlist_genre)[T.latin] True C(playlist_genre)[T.pop] True C(playlist_genre)[T.r&b] True C(playlist_genre)[T.rap] True ... C(playlist_genre)[T.latin]:duration_ms True C(playlist_genre)[T.pop]:duration_ms True C(playlist_genre)[T.r&b]:duration_ms False C(playlist_genre)[T.rap]:duration_ms True C(playlist_genre)[T.rock]:duration_ms True Length: 66, dtype: bool
my_coefplot2(fit_m6)
Model 7: Polynomial Features with Genre Interactions¶
Many of our variables had non-linear relationships with our response variable. I will use polynomial features and interact them with a categorical variable to check whether the non-linear relationship changes across categories.
fit_m7 = smf.ols('track_popularity ~ C(playlist_genre) * (loudness + danceability + energy + instrumentalness + liveness + valence + tempo + I(tempo**2) + duration_ms + I(duration_ms**2)) + C(key) + C(mode)', data=spotify_model).fit()
print("\nModel 7: Polynomial Features with Genre Interactions")
print(fit_m7.summary())
Model 7: Polynomial Features with Genre Interactions OLS Regression Results ============================================================================== Dep. Variable: track_popularity R-squared: 0.110 Model: OLS Adj. R-squared: 0.108 Method: Least Squares F-statistic: 52.56 Date: Mon, 21 Apr 2025 Prob (F-statistic): 0.00 Time: 19:18:42 Log-Likelihood: -44675. No. Observations: 32833 AIC: 8.951e+04 Df Residuals: 32755 BIC: 9.016e+04 Df Model: 77 Covariance Type: nonrobust ================================================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------------------------ Intercept -0.1901 0.026 -7.193 0.000 -0.242 -0.138 C(playlist_genre)[T.latin] 0.2488 0.030 8.320 0.000 0.190 0.307 C(playlist_genre)[T.pop] 0.3394 0.027 12.623 0.000 0.287 0.392 C(playlist_genre)[T.r&b] 0.1601 0.030 5.357 0.000 0.102 0.219 C(playlist_genre)[T.rap] 0.1693 0.031 5.521 0.000 0.109 0.229 C(playlist_genre)[T.rock] 0.2592 0.031 8.331 0.000 0.198 0.320 C(key)[T.1] 0.0111 0.022 0.501 0.617 -0.032 0.054 C(key)[T.2] -0.0235 0.024 -0.979 0.328 -0.071 0.024 C(key)[T.3] -0.0668 0.035 -1.886 0.059 -0.136 0.003 C(key)[T.4] -0.0073 0.026 -0.281 0.779 -0.059 0.044 C(key)[T.5] 0.0046 0.025 0.186 0.852 -0.044 0.053 C(key)[T.6] 0.0161 0.025 0.652 0.514 -0.032 0.064 C(key)[T.7] -0.0441 0.023 -1.920 0.055 -0.089 0.001 C(key)[T.8] 0.0648 0.025 2.580 0.010 0.016 0.114 C(key)[T.9] -0.0215 0.024 -0.907 0.364 -0.068 0.025 C(key)[T.10] 0.0280 0.026 1.078 0.281 -0.023 0.079 C(key)[T.11] 0.0081 0.024 0.338 0.736 -0.039 0.055 C(mode)[T.1] 0.0076 0.011 0.684 0.494 -0.014 0.030 loudness 0.0726 0.022 3.262 0.001 0.029 0.116 C(playlist_genre)[T.latin]:loudness 0.2547 0.029 8.809 0.000 0.198 0.311 C(playlist_genre)[T.pop]:loudness 0.2031 0.031 6.607 0.000 0.143 0.263 C(playlist_genre)[T.r&b]:loudness 0.1773 0.028 6.281 0.000 0.122 0.233 C(playlist_genre)[T.rap]:loudness 0.0117 0.029 0.405 0.685 -0.045 0.068 C(playlist_genre)[T.rock]:loudness -0.0105 0.029 -0.361 0.718 -0.068 0.047 danceability 0.0196 0.017 1.148 0.251 -0.014 0.053 C(playlist_genre)[T.latin]:danceability 0.0197 0.026 0.769 0.442 -0.030 0.070 C(playlist_genre)[T.pop]:danceability 0.0706 0.024 2.892 0.004 0.023 0.118 C(playlist_genre)[T.r&b]:danceability -0.0001 0.023 -0.006 0.995 -0.046 0.045 C(playlist_genre)[T.rap]:danceability 0.0994 0.023 4.286 0.000 0.054 0.145 C(playlist_genre)[T.rock]:danceability 0.0414 0.025 1.629 0.103 -0.008 0.091 energy -0.1922 0.023 -8.386 0.000 -0.237 -0.147 C(playlist_genre)[T.latin]:energy -0.1495 0.031 -4.810 0.000 -0.210 -0.089 C(playlist_genre)[T.pop]:energy -0.0304 0.031 -0.992 0.321 -0.091 0.030 C(playlist_genre)[T.r&b]:energy 0.0035 0.029 0.118 0.906 -0.054 0.061 C(playlist_genre)[T.rap]:energy 0.0468 0.031 1.508 0.132 -0.014 0.108 C(playlist_genre)[T.rock]:energy 0.1030 0.031 3.332 0.001 0.042 0.164 instrumentalness -0.0748 0.010 -7.615 0.000 -0.094 -0.056 C(playlist_genre)[T.latin]:instrumentalness -0.0059 0.021 -0.280 0.779 -0.047 0.036 C(playlist_genre)[T.pop]:instrumentalness -0.0177 0.020 -0.906 0.365 -0.056 0.021 C(playlist_genre)[T.r&b]:instrumentalness 0.0003 0.026 0.011 0.991 -0.051 0.051 C(playlist_genre)[T.rap]:instrumentalness 0.0687 0.018 3.909 0.000 0.034 0.103 C(playlist_genre)[T.rock]:instrumentalness -0.0304 0.020 -1.511 0.131 -0.070 0.009 liveness -0.0075 0.011 -0.667 0.505 -0.029 0.014 C(playlist_genre)[T.latin]:liveness -0.0070 0.018 -0.397 0.692 -0.042 
0.028 C(playlist_genre)[T.pop]:liveness 0.0132 0.018 0.713 0.476 -0.023 0.049 C(playlist_genre)[T.r&b]:liveness -0.0386 0.018 -2.123 0.034 -0.074 -0.003 C(playlist_genre)[T.rap]:liveness 0.0054 0.017 0.313 0.754 -0.029 0.039 C(playlist_genre)[T.rock]:liveness -0.0540 0.017 -3.210 0.001 -0.087 -0.021 valence 0.0582 0.014 4.103 0.000 0.030 0.086 C(playlist_genre)[T.latin]:valence -0.0786 0.021 -3.690 0.000 -0.120 -0.037 C(playlist_genre)[T.pop]:valence -0.0625 0.021 -2.944 0.003 -0.104 -0.021 C(playlist_genre)[T.r&b]:valence -0.1586 0.021 -7.403 0.000 -0.201 -0.117 C(playlist_genre)[T.rap]:valence -0.0750 0.020 -3.729 0.000 -0.114 -0.036 C(playlist_genre)[T.rock]:valence -0.0700 0.022 -3.138 0.002 -0.114 -0.026 tempo -0.1169 0.025 -4.758 0.000 -0.165 -0.069 C(playlist_genre)[T.latin]:tempo 0.0957 0.029 3.303 0.001 0.039 0.152 C(playlist_genre)[T.pop]:tempo 0.1025 0.029 3.526 0.000 0.046 0.160 C(playlist_genre)[T.r&b]:tempo 0.1576 0.028 5.700 0.000 0.103 0.212 C(playlist_genre)[T.rap]:tempo 0.1767 0.027 6.539 0.000 0.124 0.230 C(playlist_genre)[T.rock]:tempo 0.1365 0.029 4.774 0.000 0.080 0.193 I(tempo ** 2) 0.0792 0.015 5.232 0.000 0.050 0.109 C(playlist_genre)[T.latin]:I(tempo ** 2) -0.0206 0.019 -1.109 0.268 -0.057 0.016 C(playlist_genre)[T.pop]:I(tempo ** 2) -0.0677 0.019 -3.582 0.000 -0.105 -0.031 C(playlist_genre)[T.r&b]:I(tempo ** 2) -0.1037 0.018 -5.819 0.000 -0.139 -0.069 C(playlist_genre)[T.rap]:I(tempo ** 2) -0.1075 0.018 -5.927 0.000 -0.143 -0.072 C(playlist_genre)[T.rock]:I(tempo ** 2) -0.0721 0.018 -3.928 0.000 -0.108 -0.036 duration_ms -0.1835 0.015 -12.129 0.000 -0.213 -0.154 C(playlist_genre)[T.latin]:duration_ms 0.1512 0.023 6.455 0.000 0.105 0.197 C(playlist_genre)[T.pop]:duration_ms 0.0877 0.025 3.535 0.000 0.039 0.136 C(playlist_genre)[T.r&b]:duration_ms 0.0042 0.022 0.191 0.849 -0.039 0.047 C(playlist_genre)[T.rap]:duration_ms 0.0908 0.020 4.487 0.000 0.051 0.130 C(playlist_genre)[T.rock]:duration_ms 0.2328 0.023 9.948 0.000 0.187 0.279 I(duration_ms ** 2) 0.0112 0.006 1.808 0.071 -0.001 0.023 C(playlist_genre)[T.latin]:I(duration_ms ** 2) -0.0274 0.011 -2.435 0.015 -0.050 -0.005 C(playlist_genre)[T.pop]:I(duration_ms ** 2) -0.0108 0.012 -0.884 0.377 -0.035 0.013 C(playlist_genre)[T.r&b]:I(duration_ms ** 2) 0.0065 0.010 0.632 0.528 -0.014 0.027 C(playlist_genre)[T.rap]:I(duration_ms ** 2) -0.0271 0.010 -2.629 0.009 -0.047 -0.007 C(playlist_genre)[T.rock]:I(duration_ms ** 2) -0.0412 0.009 -4.604 0.000 -0.059 -0.024 ============================================================================== Omnibus: 2649.621 Durbin-Watson: 1.237 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1328.566 Skew: -0.325 Prob(JB): 3.20e-289 Kurtosis: 2.260 Cond. No. 41.4 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
This model is designed to check whether non-linear relationships differ across categories:
- The sign and magnitude of the tempo² and duration² interaction terms vary across genres.
- This tells us the shape of the non-linear effect is not constant across genres; genres respond differently to changes in tempo or duration.
- 78 coefficients were estimated.
- 45 coefficients are statistically significant: 25 positive and 20 negative (computed below).
The highest magnitudes are:
- C(playlist_genre)[T.rock]:duration_ms ≈ +0.2328
- duration_ms ≈ -0.1835
summary_df = fit_m7.summary2().tables[1]
sig_coefs = summary_df[summary_df['P>|t|'] < 0.05]
num_positive = (sig_coefs['Coef.'] > 0).sum()
num_negative = (sig_coefs['Coef.'] < 0).sum()
print(f"Statistically significant positive coefficients: {num_positive}")
print(f"Statistically significant negative coefficients: {num_negative}")
Statistically significant positive coefficients: 25 Statistically significant negative coefficients: 20
my_coefplot2(fit_m7)
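To see the genre-specific curvature directly, one can predict over a tempo grid while holding the other standardized inputs at their mean of 0; a sketch, assuming fit_m7 and spotify_model from above, with key fixed at 0 and mode at 1 for illustration:
# Sketch: predicted popularity vs. (standardized) tempo for each genre,
# with all other standardized inputs fixed at 0, key=0, and mode=1.
tempo_grid = np.linspace(-2, 2, 101)
for genre in spotify_model['playlist_genre'].unique():
    grid = pd.DataFrame({
        'tempo': tempo_grid, 'playlist_genre': genre,
        'loudness': 0.0, 'danceability': 0.0, 'energy': 0.0,
        'instrumentalness': 0.0, 'liveness': 0.0, 'valence': 0.0,
        'duration_ms': 0.0, 'key': 0, 'mode': 1,
    })
    plt.plot(tempo_grid, fit_m7.predict(grid), label=genre)
plt.xlabel('standardized tempo')
plt.ylabel('predicted track_popularity (standardized)')
plt.legend()
plt.show()
The curves bend differently per genre precisely because the genre-by-tempo² interaction coefficients differ in sign and size.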
Model 8: Sin(Key) with Genre Interactions¶
This model was chosen because it reflects genre-specific differences in how audio features influence track_popularity. By including interaction terms between playlist_genre and key musical characteristics, plus a sine transform that treats the musical key as cyclical, it allows the model to adjust predictions based on genre context.
spotify_model['key_std'] = (spotify_model['key'] - spotify_model['key'].mean()) / spotify_model['key'].std()
spotify_model['sin_key'] = np.sin(2 * np.pi * spotify_model['key_std'] / 12)
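One caveat worth noting: because key_std is z-scored before the sine transform, the result no longer lines up exactly with the 12-semitone octave cycle. A conventional cyclical encoding (a sketch for comparison only, not what Model 8 uses) applies sine and cosine to the raw key values (0-11):
# sketch only: cyclical encoding of the raw key (0-11); using both sine and cosine
# gives every key a unique angle on the circle (sine alone maps some keys together)
key_angle = 2 * np.pi * spotify_model['key'] / 12
sin_key_raw = np.sin(key_angle)
cos_key_raw = np.cos(key_angle)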
# Model 8: Sin Key with Genre Interactions
fit_m8 = smf.ols('track_popularity ~ C(playlist_genre) * (loudness + danceability + energy + instrumentalness + liveness + valence + tempo + duration_ms + sin_key) + C(mode)', data=spotify_model).fit()
print("\nModel 8: Sin Key with Genre Interactions")
print(fit_m8.summary())
Model 8: Sin Key with Genre Interactions

                            OLS Regression Results
==============================================================================
Dep. Variable:       track_popularity   R-squared:                       0.106
Model:                            OLS   Adj. R-squared:                  0.105
Method:                 Least Squares   F-statistic:                     65.00
Date:                Mon, 21 Apr 2025   Prob (F-statistic):               0.00
Time:                        19:18:43   Log-Likelihood:                -44742.
No. Observations:               32833   AIC:                         8.961e+04
Df Residuals:                   32772   BIC:                         9.012e+04
Df Model:                          60
Covariance Type:            nonrobust
===============================================================================================================
                                                  coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------------------------------------------
Intercept                                      -0.1370      0.018     -7.587      0.000     -0.172     -0.102
C(playlist_genre)[T.latin]                      0.2543      0.023     10.954      0.000      0.209      0.300
C(playlist_genre)[T.pop]                        0.2959      0.022     13.601      0.000      0.253      0.339
C(playlist_genre)[T.r&b]                        0.0933      0.024      3.839      0.000      0.046      0.141
C(playlist_genre)[T.rap]                        0.0563      0.022      2.518      0.012      0.012      0.100
C(playlist_genre)[T.rock]                       0.1925      0.028      6.762      0.000      0.137      0.248
C(mode)[T.1]                                    0.0064      0.011      0.594      0.552     -0.015      0.028
loudness                                        0.0693      0.022      3.150      0.002      0.026      0.112
C(playlist_genre)[T.latin]:loudness             0.2791      0.029      9.781      0.000      0.223      0.335
C(playlist_genre)[T.pop]:loudness               0.2080      0.031      6.806      0.000      0.148      0.268
C(playlist_genre)[T.r&b]:loudness               0.1777      0.028      6.333      0.000      0.123      0.233
C(playlist_genre)[T.rap]:loudness               0.0171      0.029      0.597      0.551     -0.039      0.073
C(playlist_genre)[T.rock]:loudness             -0.0038      0.029     -0.132      0.895     -0.061      0.053
danceability                                    0.0001      0.016      0.006      0.995     -0.032      0.032
C(playlist_genre)[T.latin]:danceability         0.0177      0.025      0.712      0.476     -0.031      0.066
C(playlist_genre)[T.pop]:danceability           0.0853      0.023      3.681      0.000      0.040      0.131
C(playlist_genre)[T.r&b]:danceability           0.0319      0.022      1.433      0.152     -0.012      0.075
C(playlist_genre)[T.rap]:danceability           0.1363      0.022      6.170      0.000      0.093      0.180
C(playlist_genre)[T.rock]:danceability          0.0624      0.024      2.591      0.010      0.015      0.110
energy                                         -0.2134      0.022     -9.520      0.000     -0.257     -0.169
C(playlist_genre)[T.latin]:energy              -0.1507      0.030     -4.955      0.000     -0.210     -0.091
C(playlist_genre)[T.pop]:energy                -0.0139      0.030     -0.462      0.644     -0.073      0.045
C(playlist_genre)[T.r&b]:energy                 0.0298      0.029      1.030      0.303     -0.027      0.087
C(playlist_genre)[T.rap]:energy                 0.0761      0.031      2.488      0.013      0.016      0.136
C(playlist_genre)[T.rock]:energy                0.1267      0.030      4.166      0.000      0.067      0.186
instrumentalness                               -0.0740      0.010     -7.562      0.000     -0.093     -0.055
C(playlist_genre)[T.latin]:instrumentalness    -0.0121      0.021     -0.576      0.565     -0.053      0.029
C(playlist_genre)[T.pop]:instrumentalness      -0.0188      0.019     -0.969      0.333     -0.057      0.019
C(playlist_genre)[T.r&b]:instrumentalness       0.0077      0.026      0.297      0.767     -0.043      0.058
C(playlist_genre)[T.rap]:instrumentalness       0.0612      0.017      3.536      0.000      0.027      0.095
C(playlist_genre)[T.rock]:instrumentalness     -0.0356      0.020     -1.769      0.077     -0.075      0.004
liveness                                       -0.0105      0.011     -0.942      0.346     -0.032      0.011
C(playlist_genre)[T.latin]:liveness            -0.0044      0.018     -0.251      0.802     -0.039      0.030
C(playlist_genre)[T.pop]:liveness               0.0156      0.019      0.843      0.399     -0.021      0.052
C(playlist_genre)[T.r&b]:liveness              -0.0342      0.018     -1.880      0.060     -0.070      0.001
C(playlist_genre)[T.rap]:liveness               0.0079      0.017      0.454      0.650     -0.026      0.042
C(playlist_genre)[T.rock]:liveness             -0.0524      0.017     -3.113      0.002     -0.085     -0.019
valence                                         0.0685      0.014      4.861      0.000      0.041      0.096
C(playlist_genre)[T.latin]:valence             -0.0708      0.021     -3.370      0.001     -0.112     -0.030
C(playlist_genre)[T.pop]:valence               -0.0711      0.021     -3.375      0.001     -0.112     -0.030
C(playlist_genre)[T.r&b]:valence               -0.1744      0.021     -8.199      0.000     -0.216     -0.133
C(playlist_genre)[T.rap]:valence               -0.0947      0.020     -4.770      0.000     -0.134     -0.056
C(playlist_genre)[T.rock]:valence              -0.0866      0.022     -3.898      0.000     -0.130     -0.043
tempo                                          -0.0602      0.022     -2.721      0.007     -0.104     -0.017
C(playlist_genre)[T.latin]:tempo                0.0878      0.025      3.454      0.001      0.038      0.138
C(playlist_genre)[T.pop]:tempo                  0.0513      0.026      1.943      0.052     -0.000      0.103
C(playlist_genre)[T.r&b]:tempo                  0.0923      0.025      3.649      0.000      0.043      0.142
C(playlist_genre)[T.rap]:tempo                  0.1110      0.025      4.512      0.000      0.063      0.159
C(playlist_genre)[T.rock]:tempo                 0.0823      0.026      3.168      0.002      0.031      0.133
duration_ms                                    -0.1696      0.011    -14.893      0.000     -0.192     -0.147
C(playlist_genre)[T.latin]:duration_ms          0.1193      0.020      5.967      0.000      0.080      0.158
C(playlist_genre)[T.pop]:duration_ms            0.0723      0.021      3.428      0.001      0.031      0.114
C(playlist_genre)[T.r&b]:duration_ms            0.0107      0.018      0.606      0.544     -0.024      0.045
C(playlist_genre)[T.rap]:duration_ms            0.0710      0.018      4.052      0.000      0.037      0.105
C(playlist_genre)[T.rock]:duration_ms           0.1594      0.017      9.337      0.000      0.126      0.193
sin_key                                         0.0453      0.026      1.768      0.077     -0.005      0.096
C(playlist_genre)[T.latin]:sin_key             -0.0570      0.037     -1.530      0.126     -0.130      0.016
C(playlist_genre)[T.pop]:sin_key               -0.0142      0.037     -0.389      0.697     -0.086      0.058
C(playlist_genre)[T.r&b]:sin_key               -0.0566      0.037     -1.532      0.126     -0.129      0.016
C(playlist_genre)[T.rap]:sin_key               -0.0230      0.036     -0.640      0.522     -0.093      0.047
C(playlist_genre)[T.rock]:sin_key              -0.0620      0.038     -1.622      0.105     -0.137      0.013
==============================================================================
Omnibus:                    2743.366   Durbin-Watson:                   1.228
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             1332.943
Skew:                         -0.320   Prob(JB):                    3.59e-290
Kurtosis:                      2.249   Cond. No.                         20.9
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Interpretation¶
This model is designed to check whether a repeating (cyclical) pattern in musical key changes across categories.
None of the sin_key interactions are statistically significant at p < 0.05.
This suggests that any repeating pattern in key has limited or weak genre-specific effects on popularity in our data. If the effect exists, it's subtle.
We have 61 coefficients
36 coefficients are statistically significant:
24 significant positive coefficients
12 significant negative coefficients
The highest magnitudes are:
C(playlist_genre)[T.latin]:loudness → +0.2791
energy → -0.2134
summary_df = fit_m8.summary2().tables[1]
sig_coefs = summary_df[summary_df['P>|t|'] < 0.05]
num_positive = (sig_coefs['Coef.'] > 0).sum()
num_negative = (sig_coefs['Coef.'] < 0).sum()
print(f"Statistically significant positive coefficients: {num_positive}")
print(f"Statistically significant negative coefficients: {num_negative}")
Statistically significant positive coefficients: 24
Statistically significant negative coefficients: 12
my_coefplot2(fit_m8)
Performance and Interpretation of the 8 models¶
We can now interpret the performance of our 8 regression models on the training set. We will:
- Visualize the predicted vs. observed figure for the training set.
- Compare the R^2 and RMSE values.
We can create a visualization data set using energy, which was our most significant input in most of the models.
The following UDF will help us visualize:
def plot_predicted_vs_observed(fit_m, df_viz, var_study):
"""
This is a function that takes a fitted model, a dataframe for visualization, and
a string that identifies the variable of interest, then generates a graph of the
prediction interval, confidence interval, and trend for that model with respect to
the variable of interest.
"""
fig, ax = plt.subplots()
# get predictions
predictions = fit_m.get_prediction(df_viz)
fit_m_summary = predictions.summary_frame()
# prediction interval
ax.fill_between(df_viz[var_study],
fit_m_summary.obs_ci_lower, fit_m_summary.obs_ci_upper,
facecolor='orange', alpha=0.75, edgecolor='orange')
# confidence interval
ax.fill_between(df_viz[var_study],
fit_m_summary.mean_ci_lower, fit_m_summary.mean_ci_upper,
facecolor='grey', edgecolor='grey')
# trend
ax.plot(df_viz[var_study], fit_m_summary['mean'], color='k', linewidth=1)
# set the labels
ax.set_xlabel(var_study)
ax.set_ylabel('output')
# show the plot
plt.show()
spotify_viz = pd.DataFrame({'energy': np.linspace(spotify_model.energy.min()-0.02, spotify_model.energy.max()+0.02, num=251)})
# hold every other input at its training mean (categoricals at their modes)
for col in ['danceability', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo',
            'sin_key', 'duration_ms']:
    spotify_viz[col] = spotify_model[col].mean()
spotify_viz['playlist_genre'] = spotify_model['playlist_genre'].mode()[0]
spotify_viz['key'] = spotify_model['key'].mode()[0]
spotify_viz['mode'] = spotify_model['mode'].mode()[0]
Model 1¶
plot_predicted_vs_observed(fit_m1, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m1, spotify_model, 'energy')
Model 2¶
plot_predicted_vs_observed(fit_m2, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m2, spotify_model, 'energy')
Model 3¶
plot_predicted_vs_observed(fit_m3, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m3, spotify_model, 'energy')
Model 4¶
plot_predicted_vs_observed(fit_m4, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m4, spotify_model, 'energy')
Model 5¶
plot_predicted_vs_observed(fit_m5, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m5, spotify_model, 'energy')
Model 6¶
plot_predicted_vs_observed(fit_m6, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m6, spotify_model, 'energy')
Model 7¶
plot_predicted_vs_observed(fit_m7, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m7, spotify_model, 'energy')
Model 8¶
plot_predicted_vs_observed(fit_m8, spotify_viz, 'energy')
plot_predicted_vs_observed(fit_m8, spotify_model, 'energy')
Part 5: Model Predictions¶
a. You must make predictions with two models.
i. You must predict with the model with ALL inputs and linear additive features.
ii. You must predict with the best model on the training set.
I will look at the R^2 and RMSE values to see which model performed best on the training set.
fits = {'m1': fit_m1, 'm2': fit_m2, 'm3': fit_m3, 'm4': fit_m4,
        'm5': fit_m5, 'm6': fit_m6, 'm7': fit_m7, 'm8': fit_m8}

# training-set R-squared and RMSE (root mean squared residual) for each model
r2_values = {name: fit.rsquared for name, fit in fits.items()}
rmse_values = {name: np.sqrt((fit.resid ** 2).mean()) for name, fit in fits.items()}

print('=== R-SQUARED VALUES ===')
for name, r2 in r2_values.items():
    print(name + ': ' + str(r2))
print('\nThe best R-squared value is ' + str(max(r2_values.values())) + '.\n')

print('=== RMSE VALUES ===')
for name, rmse in rmse_values.items():
    print(name + ': ' + str(rmse))
print('\nThe best RMSE value is ' + str(min(rmse_values.values())) + '.')
=== R-SQUARED VALUES ===
m1: 0.0
m2: 0.03182580693780346
m3: 0.07063377084127798
m4: 0.09139462157677147
m5: 0.08394779252405382
m6: 0.10698653520005363
m7: 0.10997460964366834
m8: 0.10634520057504149

The best R-squared value is 0.10997460964366834.

=== RMSE VALUES ===
m1: 0.9999999999999999
m2: 0.9839584305559846
m3: 0.9640364252240273
m4: 0.9532079408099938
m5: 0.957106163116687
m6: 0.9449938966998391
m7: 0.9434115699716277
m8: 0.9453331684781605

The best RMSE value is 0.9434115699716277.
We will use Models 4 and 7¶
Our most significant coefficients are energy, duration_ms, and loudness.
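As a hedged check (assuming fit_m4 is the all-inputs additive model), we can rank its fitted coefficients; the top continuous-input entries should roughly match the inputs named above:
# rank Model 4's coefficients by absolute magnitude; the largest standardized
# effects should correspond to the inputs named above
print(fit_m4.params.drop('Intercept').abs().sort_values(ascending=False).head(3))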
Visualization 1 : ALL inputs and linear additive features.¶
spotify_predict_1 = pd.DataFrame([(energy, duration_ms, loudness) for energy in np.linspace(spotify_model.energy.min(), spotify_model.energy.max(), num=101)
for duration_ms in np.linspace(spotify_model.duration_ms.min(), spotify_model.duration_ms.max(), num=5)
for loudness in np.linspace(spotify_model.loudness.min(), spotify_model.loudness.max(), num=5)],
columns=['energy', 'duration_ms', 'loudness'])
# fill the non-grid inputs with their training means (categoricals with their modes)
for col in ['danceability', 'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo', 'sin_key']:
    spotify_predict_1[col] = spotify_model[col].mean()
spotify_predict_1['playlist_genre'] = spotify_model['playlist_genre'].mode()[0]
spotify_predict_1['key'] = spotify_model['key'].mode()[0]
spotify_predict_1['mode'] = spotify_model['mode'].mode()[0]
spotify_predict_1_copy = spotify_predict_1.copy()
spotify_predict_1_copy['pred'] = fit_m4.predict(spotify_predict_1)
sns.relplot(data = spotify_predict_1_copy,
x='energy', y='pred', kind='line',
hue='duration_ms', col='loudness',
col_wrap=3, palette='coolwarm',
estimator=None, units='loudness')
plt.show()
Each subplot:
Represents a different loudness level.
Within each subplot, we see how predicted popularity (pred) changes as energy changes, for fixed duration_ms and loudness.
The lines are colored by duration_ms (standardized):
Bluer lines = shorter songs
Redder lines = longer songs
We can do the same for Model 7
Visualization 2 : Polynomial Features with Genre Interactions (Best Predicted vs Observed)¶
spotify_predict_2_copy = spotify_predict_1.copy()
spotify_predict_2_copy['pred'] = fit_m7.predict(spotify_predict_2_copy)
sns.relplot(data = spotify_predict_2_copy,
x='energy', y='pred', kind='line',
hue='duration_ms', col='loudness',
col_wrap=3, palette='coolwarm',
estimator=None, units='loudness')
plt.show()
Part 6: Performance and Validation¶
i. You must select the formulation that was the best model on the training set.
ii. You must choose 2 additional formulations. One should be simple (few features), and one should be of medium to high complexity.
- Model 7 is the best-performing model
- We will use Model 3 since it is fairly simple
- We will use Model 8 since it is of medium-to-high complexity
Below, we can set up to perform CROSS-VALIDATION
Importing the necessary packages...
from sklearn.model_selection import KFold  # for regression
from sklearn.preprocessing import StandardScaler  # used inside the CV UDF below
Creating the K-Fold generator object
kf_r = KFold(n_splits=5, shuffle=True, random_state=101)
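As an optional sanity check, we can confirm that the generator splits the data as expected:
# each of the 5 folds should hold roughly 20% of the rows as its test split
for i, (train_id, test_id) in enumerate(kf_r.split(spotify_model), start=1):
    print(f"fold {i}: train={len(train_id)}, test={len(test_id)}")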
Next, we can define a UDF to perform cross-validation
def std_train_test_linear_with_cv(mod_name, a_formula, data_df, x_names, y_name, std_vars, cv):
    """
    This is a function that takes a model name, a formula, a source dataframe, a list of
    the input names, the output name, a list of variables to be standardized, and a K-fold
    cross-validation generator object.

    Then, the function splits the data using the given K-fold generator, standardizes the
    specified variables within each fold using a StandardScaler fit on the training fold,
    fits a linear model using the given formula, makes predictions on the test split within
    each fold, calculates the R-squared and RMSE values on those test predictions, and
    stores the results along with model information in a new results dataframe.

    This dataframe is then returned.
    """
    # separate the inputs and output
    input_df = data_df.loc[:, x_names].copy()
    # initialize the performance metric storage lists
    test_res_r2 = []
    test_res_rmse = []
    # split the data, preprocess and iterate over the folds
    for train_id, test_id in cv.split(input_df.to_numpy(), data_df[y_name].to_numpy()):
        # subset the training and test splits within each fold
        train_data = data_df.iloc[train_id, :].copy()
        test_data = data_df.iloc[test_id, :].copy()
        # standardize: fit the scaler on the training fold only, then apply it to both
        # splits so the test fold is scaled consistently with the training fold
        scaler = StandardScaler().fit(train_data[std_vars])
        train_data[std_vars] = scaler.transform(train_data[std_vars])
        test_data[std_vars] = scaler.transform(test_data[std_vars])
        # fit the model on the training data within the current fold
        a_mod = smf.ols(formula=a_formula, data=train_data).fit()
        # predict the testing set within the fold
        test_copy = test_data.copy()
        test_copy['pred'] = a_mod.predict(test_data)
        # calculate the performance metrics on the testing set within the fold
        resid = test_copy[y_name] - test_copy['pred']
        ss_res = (resid ** 2).sum()
        ss_tot = ((test_copy[y_name] - test_copy[y_name].mean()) ** 2).sum()
        test_res_r2.append(1.0 - ss_res / ss_tot)
        test_res_rmse.append(np.sqrt((resid ** 2).mean()))
    # book keeping to store the results
    res_df = pd.DataFrame({'R-squared': test_res_r2, 'RMSE': test_res_rmse})
    res_df['from_set'] = 'testing'
    res_df['fold_id'] = res_df.index + 1
    # add information about the model
    res_df['model_name'] = mod_name
    res_df['model_formula'] = a_formula
    res_df['num_coefs'] = len(a_mod.params)
    return res_df
input_names = [
'danceability', 'energy', 'loudness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms'
]
lin_standard = [
'danceability', 'energy', 'loudness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms', 'track_popularity'
]
log_standard = [
'danceability', 'energy', 'loudness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'duration_ms'
]
Model 3¶
res_m3 = std_train_test_linear_with_cv('m3', 'track_popularity ~ (danceability + energy + loudness + instrumentalness + liveness + valence + tempo + duration_ms)', spotify_model, input_names, 'track_popularity', lin_standard, kf_r)
Model 7¶
res_m7 = std_train_test_linear_with_cv('m7', 'track_popularity ~ C(playlist_genre) * (loudness + danceability + energy + instrumentalness + liveness + valence + tempo + I(tempo**2) + duration_ms + I(duration_ms**2)) + C(key) + C(mode)', spotify_model, input_names, 'track_popularity', lin_standard, kf_r)
Model 8¶
res_m8 = std_train_test_linear_with_cv('m8', 'track_popularity ~ C(playlist_genre) * (loudness + danceability + energy + instrumentalness + liveness + valence + tempo + duration_ms + sin_key) + C(mode)', spotify_model, input_names, 'track_popularity', lin_standard, kf_r)
print("\nModel 8: Polynomial Features with Genre Interactions")
Model 8: Polynomial Features with Genre Interactions
cv_results_r = pd.concat([res_m3, res_m7, res_m8], ignore_index=True)
cv_results_r
  | R-squared | RMSE | from_set | fold_id | model_name | model_formula | num_coefs |
---|---|---|---|---|---|---|---|
0 | 0.069420 | 0.964666 | testing | 1 | m3 | track_popularity ~ (danceability + energy + lo... | 9 |
1 | 0.071928 | 0.963365 | testing | 2 | m3 | track_popularity ~ (danceability + energy + lo... | 9 |
2 | 0.071632 | 0.963518 | testing | 3 | m3 | track_popularity ~ (danceability + energy + lo... | 9 |
3 | 0.072386 | 0.963127 | testing | 4 | m3 | track_popularity ~ (danceability + energy + lo... | 9 |
4 | 0.068038 | 0.965382 | testing | 5 | m3 | track_popularity ~ (danceability + energy + lo... | 9 |
5 | 0.110741 | 0.943005 | testing | 1 | m7 | track_popularity ~ C(playlist_genre) * (loudne... | 78 |
6 | 0.110849 | 0.942948 | testing | 2 | m7 | track_popularity ~ C(playlist_genre) * (loudne... | 78 |
7 | 0.110899 | 0.942922 | testing | 3 | m7 | track_popularity ~ C(playlist_genre) * (loudne... | 78 |
8 | 0.111149 | 0.942789 | testing | 4 | m7 | track_popularity ~ C(playlist_genre) * (loudne... | 78 |
9 | 0.109010 | 0.943923 | testing | 5 | m7 | track_popularity ~ C(playlist_genre) * (loudne... | 78 |
10 | 0.106954 | 0.945011 | testing | 1 | m8 | track_popularity ~ C(playlist_genre) * (loudne... | 61 |
11 | 0.107469 | 0.944739 | testing | 2 | m8 | track_popularity ~ C(playlist_genre) * (loudne... | 61 |
12 | 0.106479 | 0.945262 | testing | 3 | m8 | track_popularity ~ C(playlist_genre) * (loudne... | 61 |
13 | 0.107552 | 0.944695 | testing | 4 | m8 | track_popularity ~ C(playlist_genre) * (loudne... | 61 |
14 | 0.105368 | 0.945850 | testing | 5 | m8 | track_popularity ~ C(playlist_genre) * (loudne... | 61 |
Among the tested models, m7 performs best with an average R² of ~0.110 and the lowest RMSE (0.943). This suggests that the influence of audio features on popularity is genre-dependent, and capturing those interactions provides a measurable improvement in predictive power.
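We can back this claim up by averaging the fold-level metrics per model (a quick summary of the cv_results_r frame built above):
# mean and spread of the CV metrics per model; m7 should show the highest mean R-squared
print(cv_results_r.groupby('model_name')[['R-squared', 'RMSE']].agg(['mean', 'std']))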
fig, ax = plt.subplots(2, 2, figsize=(8,8))
sns.pointplot(data=cv_results_r, x='model_name', y='R-squared', errorbar=('ci', 68), linestyle='none', ax=ax[0,0])
sns.pointplot(data=cv_results_r, x='model_name', y='RMSE', errorbar=('ci', 68), linestyle='none', ax=ax[0,1])
sns.pointplot(data=cv_results_r, x='model_name', y='R-squared', errorbar=('ci', 95), linestyle='none', ax=ax[1,0])
sns.pointplot(data=cv_results_r, x='model_name', y='RMSE', errorbar=('ci', 95), linestyle='none', ax=ax[1,1])
ax[0,0].set_title('R-squared (68% CI)')
ax[0,1].set_title('RMSE (68% CI)')
ax[1,0].set_title('R-squared (95% CI)')
ax[1,1].set_title('RMSE (95% CI)')
for ax_i in ax.flat:
ax_i.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
d. Which model is the BEST according to CROSS-VALIDATION?¶
Model 7 (our polynomial model) is still our best model: it shows the highest R^2 and the lowest RMSE across the folds.
e. Is this model DIFFERENT from the model identified as the BEST according to the training set?¶
Model 7 was the best in both cases.
f. How many regression coefficients are associated with the best model?¶
This model has 78 coefficients.
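We can verify this count directly from the fitted model:
# the number of estimated parameters equals the number of regression coefficients
print(len(fit_m7.params))  # expected: 78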
Other types of regression¶
Our polynomial model (Model 7) is our most complex, so we will carry its main numeric inputs into the other types of regression.
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
Ridge Regression¶
First, we can set up our features, standardize them, and define the alpha range.
target = 'track_popularity'
features = ['loudness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
X = spotify_model[features]
y = spotify_model[target]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
alphas = np.logspace(-3, 3, 50)
Fit RidgeCV (performs cross-validation internally)
ridge_cv = RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_scaled, y)
RidgeCV(alphas=array([1.00000000e-03, 1.32571137e-03, 1.75751062e-03, 2.32995181e-03, 3.08884360e-03, 4.09491506e-03, 5.42867544e-03, 7.19685673e-03, 9.54095476e-03, 1.26485522e-02, 1.67683294e-02, 2.22299648e-02, 2.94705170e-02, 3.90693994e-02, 5.17947468e-02, 6.86648845e-02, 9.10298178e-02, 1.20679264e-01, 1.59985872e-01, 2.12095089e-01, 2.81176870e-01, 3.72759372e-0... 2.68269580e+00, 3.55648031e+00, 4.71486636e+00, 6.25055193e+00, 8.28642773e+00, 1.09854114e+01, 1.45634848e+01, 1.93069773e+01, 2.55954792e+01, 3.39322177e+01, 4.49843267e+01, 5.96362332e+01, 7.90604321e+01, 1.04811313e+02, 1.38949549e+02, 1.84206997e+02, 2.44205309e+02, 3.23745754e+02, 4.29193426e+02, 5.68986603e+02, 7.54312006e+02, 1.00000000e+03]), cv=5, scoring='neg_mean_squared_error')
Compute the CV MSE for each alpha (RidgeCV with cv=5 keeps only the best score, so we refit a Ridge model per alpha with cross_val_score)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# 5-fold CV MSE for every alpha in the grid (mean across folds)
cv_mse = [-cross_val_score(Ridge(alpha=alpha), X_scaled, y,
                           cv=5, scoring='neg_mean_squared_error').mean()
          for alpha in alphas]
# the tuned model's best alpha and its CV MSE
best_alpha = ridge_cv.alpha_
best_mse = -ridge_cv.best_score_
# Plot 1: CV MSE vs. Alpha
plt.figure(figsize=(8, 6))
plt.semilogx(alphas, cv_mse, '-o')
plt.axvline(best_alpha, color='red', linestyle='--', label=f'Best alpha = {best_alpha:.4f}')
plt.xlabel('Alpha (log scale)')
plt.ylabel('Cross-Validation MSE')
plt.title('Ridge Regression: CV MSE vs. Alpha')
plt.legend()
plt.grid(True)
plt.show()
# Plot 2: Coefficients of the tuned model
plt.figure(figsize=(10, 6))
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': ridge_cv.coef_})
coef_df = coef_df.sort_values('Coefficient', ascending=False)
plt.bar(coef_df['Feature'], coef_df['Coefficient'])
plt.xticks(rotation=45)
plt.xlabel('Feature')
plt.ylabel('Coefficient Value')
plt.title(f'Ridge Regression Coefficients (alpha = {best_alpha:.4f})')
plt.tight_layout()
plt.show()
Energy and loudness were our strongest coefficients in linear regression. This is supported once again by our Ridge Regression findings.
Lasso Regression¶
Fit the Lasso and get the CV MSE for each alpha (LassoCV stores the per-alpha MSE path in mse_path_)
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_cv.fit(X_scaled, y)
cv_mse = lasso_cv.mse_path_.mean(axis=1)
best_alpha = lasso_cv.alpha_
best_mse = cv_mse[np.where(lasso_cv.alphas_ == best_alpha)[0][0]]
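Unlike Ridge, Lasso can shrink coefficients exactly to zero. A quick check on how many of our inputs survive at the tuned alpha (a sketch using the lasso_cv object above):
# count the inputs Lasso keeps (non-zero coefficients) at the selected alpha
n_nonzero = int((lasso_cv.coef_ != 0).sum())
print(f"Non-zero coefficients at alpha = {lasso_cv.alpha_:.4f}: {n_nonzero} of {len(features)}")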
Plot 1: CV MSE vs. Alpha
plt.figure(figsize=(8, 6))
plt.semilogx(lasso_cv.alphas_, cv_mse, '-o')
plt.axvline(best_alpha, color='red', linestyle='--', label=f'Best alpha = {best_alpha:.4f}')
plt.xlabel('Alpha (log scale)')
plt.ylabel('Cross-Validation MSE')
plt.title('Lasso Regression: CV MSE vs. Alpha')
plt.legend()
plt.grid(True)
plt.show()
Plot 2: Coefficients of the tuned model
plt.figure(figsize=(10, 6))
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': lasso_cv.coef_})
coef_df = coef_df.sort_values('Coefficient', ascending=False)
plt.bar(coef_df['Feature'], coef_df['Coefficient'])
plt.xticks(rotation=45)
plt.xlabel('Feature')
plt.ylabel('Coefficient Value')
plt.title(f'Lasso Regression Coefficients (alpha = {best_alpha:.4f})')
plt.tight_layout()
plt.show()
Energy and loudness were our strongest coefficients in linear regression. This is supported once again by our Lasso Regression findings.
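To close, a side-by-side view of the tuned Ridge and Lasso coefficients (a sketch assuming both models were fit as above); the ordering should echo the linear-regression findings:
# compare the two penalized fits feature by feature, sorted by Ridge magnitude
comparison = pd.DataFrame({'Feature': features,
                           'Ridge': ridge_cv.coef_,
                           'Lasso': lasso_cv.coef_})
comparison = comparison.reindex(comparison['Ridge'].abs().sort_values(ascending=False).index)
print(comparison)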