Dataset Description 📋¶
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Podcast Listening Time Prediction dataset. Feature distributions are close to, but not identical to, the original. Feel free to use the original dataset as part of this competition, both to explore the differences and to see whether incorporating the original data in training improves model performance.
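One way to act on that suggestion is to stack the original dataset's rows onto the competition train set before fitting. The helper below is a minimal sketch with tiny stand-in frames; the real filenames and any column mismatches between the two files would need checking before use.

```python
import pandas as pd

def add_original_data(train: pd.DataFrame, original: pd.DataFrame) -> pd.DataFrame:
    # Keep only the columns the competition train set has, then stack the rows
    original = original[train.columns.intersection(original.columns)]
    combined = pd.concat([train, original], ignore_index=True)
    return combined.drop_duplicates()

# Tiny stand-in frames for illustration only
train = pd.DataFrame({'Episode_Length_minutes': [60.0, 30.0],
                      'Listening_Time_minutes': [45.0, 20.0]})
original = pd.DataFrame({'Episode_Length_minutes': [90.0],
                         'Listening_Time_minutes': [70.0],
                         'extra_col': ['x']})
print(add_original_data(train, original).shape)  # (3, 2)
```

Whether the extra rows actually help is an empirical question; validating on competition data only is the safer way to measure it.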
Data Files 📁¶
train.csv - the training dataset; Listening_Time_minutes is the target.
test.csv - the test dataset; your objective is to predict the Listening_Time_minutes for each row.
sample_submission.csv - a sample submission file in the correct format.
Submission Criteria ✅¶
Submissions are scored on the Root Mean Squared Error.
RMSE is defined as:
$$ \textrm{RMSE} = \left( \frac{1}{N} \sum_{i=1}^{N} (y_i - \widehat{y}_i)^2 \right)^{\frac{1}{2}} $$
where $y_i$ is the true value for instance $i$, and $\widehat{y}_i$ is the predicted value.
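As a sanity check, the metric is easy to compute directly with NumPy (a hand-rolled sketch; scikit-learn's `root_mean_squared_error`, used later in this notebook, gives the same result):

```python
import numpy as np

def rmse(y_true, y_pred):
    # Root of the mean squared difference, term for term with the formula above
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

print(rmse([0.0, 0.0], [2.0, -2.0]))  # 2.0
```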
Submission File¶
For each id in the test set, you must predict the Listening_Time_minutes of the podcast. The file should contain a header and have the following format:
id,Listening_Time_minutes
750000,45.437
750001,45.437
750002,45.437
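Before uploading, a few cheap checks on the submission file can catch format mistakes. This helper is illustrative only, not part of the competition tooling; the expected row count is an assumption that should be matched to the actual test set.

```python
import pandas as pd

def check_submission(df: pd.DataFrame, expected_rows: int) -> bool:
    # expected_rows must match the real test set size (an assumption here)
    assert list(df.columns) == ['id', 'Listening_Time_minutes'], "wrong header"
    assert len(df) == expected_rows, "wrong row count"
    assert df['id'].is_unique, "duplicate ids"
    assert df['Listening_Time_minutes'].notna().all(), "missing predictions"
    return True

sub = pd.DataFrame({'id': [750000, 750001, 750002],
                    'Listening_Time_minutes': [45.437, 45.437, 45.437]})
print(check_submission(sub, expected_rows=3))  # True
```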
Import Basic Libraries ⬇️¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Data Import and Overview 🔎¶
# Load Data
podcast_train = pd.read_csv("podcast_train.csv")
podcast_test = pd.read_csv("podcast_test.csv")
podcast_submission = pd.read_csv("podcast_sample_submission.csv")
# Preview the training data
print("Training Data Preview:")
display(podcast_train.head())
# Preview the test data
print("Test Data Preview:")
display(podcast_test.head())
# Preview the submission data
print("Sample Submission Data Preview:")
display(podcast_submission.head())
Training Data Preview:
| | id | Podcast_Name | Episode_Title | Episode_Length_minutes | Genre | Host_Popularity_percentage | Publication_Day | Publication_Time | Guest_Popularity_percentage | Number_of_Ads | Episode_Sentiment | Listening_Time_minutes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Mystery Matters | Episode 98 | NaN | True Crime | 74.81 | Thursday | Night | NaN | 0.0 | Positive | 31.41998 |
| 1 | 1 | Joke Junction | Episode 26 | 119.80 | Comedy | 66.95 | Saturday | Afternoon | 75.95 | 2.0 | Negative | 88.01241 |
| 2 | 2 | Study Sessions | Episode 16 | 73.90 | Education | 69.97 | Tuesday | Evening | 8.97 | 0.0 | Negative | 44.92531 |
| 3 | 3 | Digital Digest | Episode 45 | 67.17 | Technology | 57.22 | Monday | Morning | 78.70 | 2.0 | Positive | 46.27824 |
| 4 | 4 | Mind & Body | Episode 86 | 110.51 | Health | 80.07 | Monday | Afternoon | 58.68 | 3.0 | Neutral | 75.61031 |
Test Data Preview:
| | id | Podcast_Name | Episode_Title | Episode_Length_minutes | Genre | Host_Popularity_percentage | Publication_Day | Publication_Time | Guest_Popularity_percentage | Number_of_Ads | Episode_Sentiment |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 750000 | Educational Nuggets | Episode 73 | 78.96 | Education | 38.11 | Saturday | Evening | 53.33 | 1.0 | Neutral |
| 1 | 750001 | Sound Waves | Episode 23 | 27.87 | Music | 71.29 | Sunday | Morning | NaN | 0.0 | Neutral |
| 2 | 750002 | Joke Junction | Episode 11 | 69.10 | Comedy | 67.89 | Friday | Evening | 97.51 | 0.0 | Positive |
| 3 | 750003 | Comedy Corner | Episode 73 | 115.39 | Comedy | 23.40 | Sunday | Morning | 51.75 | 2.0 | Positive |
| 4 | 750004 | Life Lessons | Episode 50 | 72.32 | Lifestyle | 58.10 | Wednesday | Morning | 11.30 | 2.0 | Neutral |
Sample Submission Data Preview:
| | id | Listening_Time_minutes |
|---|---|---|
| 0 | 750000 | 45.437 |
| 1 | 750001 | 45.437 |
| 2 | 750002 | 45.437 |
| 3 | 750003 | 45.437 |
| 4 | 750004 | 45.437 |
# Data Overview
print("Training Data Info:\n")
display(podcast_train.info())
# Stat summary
print("Training Data Statistical Summary:")
display(podcast_train.describe().round(2))
# Missing value summary sorted by most missing
missing_values = podcast_train.isnull().sum().sort_values(ascending=False)
missing_percent = ((missing_values / len(podcast_train)) * 100).round(2)
missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percent': missing_percent})
print("Training Data Missing Values Summary:")
display(missing_df[missing_df['Missing Values'] > 0])
# Display categorical cardinalities
cat_cols = podcast_train.select_dtypes(include=['object', 'category']).columns.tolist()
cardinality = podcast_train[cat_cols].nunique().sort_values(ascending=False)
print("Training Data Categorical Cardinalities:")
display(cardinality.to_frame('nunique'))
Training Data Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   id                           750000 non-null  int64
 1   Podcast_Name                 750000 non-null  object
 2   Episode_Title                750000 non-null  object
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object
 7   Publication_Time             750000 non-null  object
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB
None
Training Data Statistical Summary:
| | id | Episode_Length_minutes | Host_Popularity_percentage | Guest_Popularity_percentage | Number_of_Ads | Listening_Time_minutes |
|---|---|---|---|---|---|---|
| count | 750000.00 | 662907.00 | 750000.00 | 603970.00 | 749999.00 | 750000.00 |
| mean | 374999.50 | 64.50 | 59.86 | 52.24 | 1.35 | 45.44 |
| std | 216506.50 | 32.97 | 22.87 | 28.45 | 1.15 | 27.14 |
| min | 0.00 | 0.00 | 1.30 | 0.00 | 0.00 | 0.00 |
| 25% | 187499.75 | 35.73 | 39.41 | 28.38 | 0.00 | 23.18 |
| 50% | 374999.50 | 63.84 | 60.05 | 53.58 | 1.00 | 43.38 |
| 75% | 562499.25 | 94.07 | 79.53 | 76.60 | 2.00 | 64.81 |
| max | 749999.00 | 325.24 | 119.46 | 119.91 | 103.91 | 119.97 |
Training Data Missing Values Summary:
| | Missing Values | Percent |
|---|---|---|
| Guest_Popularity_percentage | 146030 | 19.47 |
| Episode_Length_minutes | 87093 | 11.61 |
| Number_of_Ads | 1 | 0.00 |
Training Data Categorical Cardinalities:
| | nunique |
|---|---|
| Episode_Title | 100 |
| Podcast_Name | 48 |
| Genre | 10 |
| Publication_Day | 7 |
| Publication_Time | 4 |
| Episode_Sentiment | 3 |
Exploratory Data Analysis 📊¶
# Set Up Target, Numeric and Categorical Columns
target_col = 'Listening_Time_minutes'
num_cols = (
podcast_train.select_dtypes(include=[np.number])
.columns.drop(['id'], errors='ignore')
.tolist()
)
cat_cols = (
podcast_train.select_dtypes(include=['object', 'category'])
.columns.drop(['id'], errors='ignore')
.tolist()
)
# Ensure target exists and is numeric
assert target_col in podcast_train.columns, f"Missing {target_col} in train."
assert pd.api.types.is_numeric_dtype(podcast_train[target_col]), f"{target_col} must be numeric."
# Listening Time distribution
plt.figure(figsize=(7,3))
sns.histplot(podcast_train[target_col], bins=50, kde=True, color='purple')
plt.title('Majority of Listening Time is between 20-60 minutes')
plt.xlabel('minutes'); plt.ylabel('count')
plt.show()
# Top numeric correlations with Listening Time
num_for_corr = [c for c in num_cols if c != target_col]
corr_to_target = (
podcast_train[num_for_corr + [target_col]].corr()[target_col]
.drop(labels=[target_col])
.sort_values(key=np.abs, ascending=False)
)
top_feats = corr_to_target.head(8).index.tolist()
top_num = top_feats[:4]
# Correlation matrix for target + top features
corr_mat = podcast_train[[target_col] + top_feats].corr()
# Heatmap
plt.figure(figsize=(7,5))
sns.heatmap(corr_mat, annot=True, fmt=".2f", cmap="Purples", square=True, vmin=-1, vmax=1)
plt.title("Episode Length has the strongest correlation with Listening Time")
plt.show()
# Scatter plots: top numeric features vs Listening Time
sample = podcast_train.sample(min(100_000, len(podcast_train)), random_state=51)
n = len(top_num)
plt.figure(figsize=(10, 3*n))
for i, col in enumerate(top_num, 1):
    plt.subplot(n, 1, i)
    sns.scatterplot(data=sample, x=col, y=target_col, alpha=0.25, color='purple')
    plt.title(f'{target_col} vs {col}')
    plt.xlabel(col); plt.ylabel(target_col)
plt.tight_layout(); plt.show()
# Categorical Cardinality and Target-by-Category Bar Charts
if cat_cols:
    # Choose low-cardinality columns for bar charts (<= 20 unique values)
    low_card = [c for c in cat_cols if podcast_train[c].nunique() <= 20]
    # Prefer common interpretable columns if present
    preferred = [c for c in ['Genre','Publication_Day','Episode_Sentiment'] if c in low_card]
    plot_cols = preferred[:3] if preferred else low_card[:3]
    for col in plot_cols:
        order = podcast_train[col].value_counts().index
        agg = podcast_train.groupby(col, as_index=False)[target_col].mean()
        plt.figure(figsize=(8,3))
        sns.barplot(data=agg, x=col, y=target_col, order=order, color='purple')
        plt.title(f'Average {target_col} by {col}')
        plt.xlabel(col); plt.ylabel(f'avg {target_col}')
        plt.xticks(rotation=30, ha='right')
        plt.show()
    # For high-cardinality columns, show top counts only
    high_card = [c for c in cat_cols if podcast_train[c].nunique() > 20]
    for col in high_card[:1]:  # one chart
        top_counts = podcast_train[col].value_counts().head(10)
        plt.figure(figsize=(8,3))
        sns.barplot(x=top_counts.values, y=top_counts.index, color='purple')
        plt.title(f'Top 10 {col} (by count)')
        plt.xlabel('count'); plt.ylabel(col)
        plt.show()
# Train vs Test: numeric distribution check (robust to outliers; plotting only)
num_compare = [c for c in ['Episode_Length_minutes','Host_Popularity_percentage',
'Guest_Popularity_percentage','Number_of_Ads'] if c in num_cols]
n = len(num_compare)
if n > 0:
    q_low, q_high = 0.005, 0.995  # clip to 0.5%–99.5% for visualization
    plt.figure(figsize=(10, 2.6*n))
    for i, col in enumerate(num_compare, 1):
        plt.subplot(n, 1, i)
        lo = podcast_train[col].quantile(q_low)
        hi = podcast_train[col].quantile(q_high)
        # Clip both train and test to the same bounds so shapes are comparable
        tr = podcast_train[col].clip(lo, hi)
        te = podcast_test[col].clip(lo, hi)
        sns.kdeplot(tr, label='train', fill=True, alpha=0.35, color='purple')
        sns.kdeplot(te, label='test', fill=True, alpha=0.35, color='gray')
        # Small note on how many values fell outside the bounds (for context)
        tr_out = int(((podcast_train[col] < lo) | (podcast_train[col] > hi)).sum())
        te_out = int(((podcast_test[col] < lo) | (podcast_test[col] > hi)).sum())
        plt.title(f'{col}: Train vs Test (clipped to {q_low*100:g}-{q_high*100:g} percentile)')
        plt.xlabel(col); plt.ylabel('density'); plt.legend()
        plt.text(0.99, 0.85, f'clipped train={tr_out}, test={te_out}', transform=plt.gca().transAxes,
                 ha='right', fontsize=8, color='dimgray')
    plt.tight_layout(); plt.show()
Data Cleaning and Preparation 🧹¶
Cleaning Numerical and Categorical Columns¶
# Save the test ids, then drop the 'id' column from both sets
test_ids = podcast_test['id'].copy() if 'id' in podcast_test.columns else None
podcast_train.drop(columns=['id'], inplace=True, errors='ignore')
podcast_test.drop(columns=['id'], inplace=True, errors='ignore')
# Find columns present in both train and test
common_cols = podcast_train.columns.intersection(podcast_test.columns)
# Split columns by type
# Numerical
numerical_cols = podcast_train[common_cols].select_dtypes(include=[np.number]).columns
# Categorical
categorical_cols = podcast_train[common_cols].select_dtypes(include=['object', 'category']).columns
# Handle missing values in numerical columns
# For both train and test, fill missing values with the median calculated from the train set
train_medians = podcast_train[numerical_cols].median()
podcast_train[numerical_cols] = podcast_train[numerical_cols].fillna(train_medians)
podcast_test[numerical_cols] = podcast_test[numerical_cols].fillna(train_medians)
# Impute categorical values
if len(categorical_cols) > 0:
    train_modes = podcast_train[categorical_cols].mode(dropna=True)
    train_modes = train_modes.iloc[0] if not train_modes.empty else pd.Series(index=categorical_cols)
    for col in categorical_cols:
        mode_val = train_modes.get(col, None)
        if mode_val is not None:
            podcast_train[col] = podcast_train[col].fillna(mode_val)
            podcast_test[col] = podcast_test[col].fillna(mode_val)
Final Checks before Modelling¶
target_col = 'Listening_Time_minutes'
assert target_col not in common_cols, "Target must not be in feature columns."
print("Test numeric missing values:")
print(podcast_test[numerical_cols].isnull().sum())
print("\nTest categorical missing values:")
print(podcast_test[categorical_cols].isnull().sum())
print("\nFeature total missing values:\nTrain:",
int(podcast_train[common_cols].isna().sum().sum()),
"| Test:",
int(podcast_test[common_cols].isna().sum().sum()))
Test numeric missing values:
Episode_Length_minutes         0
Host_Popularity_percentage     0
Guest_Popularity_percentage    0
Number_of_Ads                  0
dtype: int64

Test categorical missing values:
Podcast_Name         0
Episode_Title        0
Genre                0
Publication_Day      0
Publication_Time     0
Episode_Sentiment    0
dtype: int64

Feature total missing values:
Train: 0 | Test: 0
Modelling the Data 🛠️¶
Import Model Libraries ⬇️¶
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
Define the Baseline RMSE¶
# Define target variable
target_col = "Listening_Time_minutes"
X = podcast_train.drop(columns=[target_col])
y = podcast_train[target_col]
# 80/20 split
X_train, X_val, y_train, y_val = train_test_split(
X, y, test_size=0.2, random_state=51
)
# Predict the training mean on validation
mean_value = y_train.mean()
y_pred = np.full(shape=len(y_val), fill_value=mean_value)
rmse_baseline = root_mean_squared_error(y_val, y_pred)
print(f"Baseline mean predictor RMSE: {rmse_baseline:.2f}")
Baseline mean predictor RMSE: 27.09
Encode Categorical Values¶
# Encoded once here and reused by all the models below
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
X_train_enc = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_val_enc = pd.get_dummies(X_val, columns=cat_cols, drop_first=True)
X_val_enc = X_val_enc.reindex(columns=X_train_enc.columns, fill_value=0)
Decision Tree Regressor 🌲¶
# max_depth and min_samples_leaf values to cycle through
depth_list = [6, 8, 10, 12, 14, None]
leaf_list = [5, 10, 20, 40]
best_rmse = None
best_params = None
# Looping through the above values to find the best RMSE
print("Decision Tree search:")
for d in depth_list:
    for leaf in leaf_list:
        model = DecisionTreeRegressor(
            random_state=51,
            max_depth=d,
            min_samples_leaf=leaf
        )
        model.fit(X_train_enc, y_train)
        preds = model.predict(X_val_enc)
        rmse = root_mean_squared_error(y_val, preds)
        print(f"max_depth={d} | min_samples_leaf={leaf} -> RMSE: {rmse:.2f}")
        if (best_rmse is None) or (rmse < best_rmse):
            best_rmse = rmse
            best_params = (d, leaf)
print("\nBest params:",
      f"max_depth={best_params[0]}, min_samples_leaf={best_params[1]} -> RMSE: {best_rmse:.2f}")
# Fit final DT on the best params
dt_model = DecisionTreeRegressor(
random_state=51,
max_depth=best_params[0],
min_samples_leaf=best_params[1]
)
dt_model.fit(X_train_enc, y_train)
val_preds = dt_model.predict(X_val_enc)
rmse_dt = root_mean_squared_error(y_val, val_preds)
print(f"\nFinal Decision Tree RMSE: {rmse_dt:.2f}")
Decision Tree search:
max_depth=6 | min_samples_leaf=5 -> RMSE: 13.30
max_depth=6 | min_samples_leaf=10 -> RMSE: 13.30
max_depth=6 | min_samples_leaf=20 -> RMSE: 13.30
max_depth=6 | min_samples_leaf=40 -> RMSE: 13.30
max_depth=8 | min_samples_leaf=5 -> RMSE: 13.23
max_depth=8 | min_samples_leaf=10 -> RMSE: 13.23
max_depth=8 | min_samples_leaf=20 -> RMSE: 13.23
max_depth=8 | min_samples_leaf=40 -> RMSE: 13.23
max_depth=10 | min_samples_leaf=5 -> RMSE: 13.21
max_depth=10 | min_samples_leaf=10 -> RMSE: 13.21
max_depth=10 | min_samples_leaf=20 -> RMSE: 13.20
max_depth=10 | min_samples_leaf=40 -> RMSE: 13.20
max_depth=12 | min_samples_leaf=5 -> RMSE: 13.23
max_depth=12 | min_samples_leaf=10 -> RMSE: 13.23
max_depth=12 | min_samples_leaf=20 -> RMSE: 13.22
max_depth=12 | min_samples_leaf=40 -> RMSE: 13.22
max_depth=14 | min_samples_leaf=5 -> RMSE: 13.31
max_depth=14 | min_samples_leaf=10 -> RMSE: 13.30
max_depth=14 | min_samples_leaf=20 -> RMSE: 13.27
max_depth=14 | min_samples_leaf=40 -> RMSE: 13.25
max_depth=None | min_samples_leaf=5 -> RMSE: 15.27
max_depth=None | min_samples_leaf=10 -> RMSE: 14.31
max_depth=None | min_samples_leaf=20 -> RMSE: 13.72
max_depth=None | min_samples_leaf=40 -> RMSE: 13.44

Best params: max_depth=10, min_samples_leaf=40 -> RMSE: 13.20

Final Decision Tree RMSE: 13.20
Random Forest Regressor 🌲🌲🌲¶
# Defining the hyperparameters to cycle through
n_estimators_list = [120, 240]
max_depth_list = [10, 16]
min_samples_leaf_list = [5, 15]
best_rf_rmse = None
best_rf_params = None # (n_estimators, max_depth, min_samples_leaf)
# Looping through the hyperparameter values to find the best RMSE
print("Random Forest search:")
for n in n_estimators_list:
    for d in max_depth_list:
        for leaf in min_samples_leaf_list:
            rf = RandomForestRegressor(
                n_estimators=n,
                max_depth=d,
                min_samples_leaf=leaf,
                max_features='sqrt',
                random_state=51,
                n_jobs=-1
            )
            rf.fit(X_train_enc, y_train)
            preds = rf.predict(X_val_enc)
            rmse = root_mean_squared_error(y_val, preds)
            print(f"n_estimators={n} | max_depth={d} | min_samples_leaf={leaf} -> RMSE: {rmse:.2f}")
            if (best_rf_rmse is None) or (rmse < best_rf_rmse):
                best_rf_rmse = rmse
                best_rf_params = (n, d, leaf)
print("\nBest RF params:",
      f"n_estimators={best_rf_params[0]}, max_depth={best_rf_params[1]},",
      f"min_samples_leaf={best_rf_params[2]} -> RMSE: {best_rf_rmse:.2f}")
# Fit final RF on best params
rf_model = RandomForestRegressor(
n_estimators=best_rf_params[0],
max_depth=best_rf_params[1],
min_samples_leaf=best_rf_params[2],
max_features='sqrt',
random_state=51,
n_jobs=-1
).fit(X_train_enc, y_train)
val_preds_rf = rf_model.predict(X_val_enc)
rmse_rf = root_mean_squared_error(y_val, val_preds_rf)
print(f"\nFinal Random Forest RMSE: {rmse_rf:.2f}")
Random Forest search:
n_estimators=120 | max_depth=10 | min_samples_leaf=5 -> RMSE: 18.87
n_estimators=120 | max_depth=10 | min_samples_leaf=15 -> RMSE: 19.07
n_estimators=120 | max_depth=16 | min_samples_leaf=5 -> RMSE: 16.74
n_estimators=120 | max_depth=16 | min_samples_leaf=15 -> RMSE: 16.65
n_estimators=240 | max_depth=10 | min_samples_leaf=5 -> RMSE: 19.03
n_estimators=240 | max_depth=10 | min_samples_leaf=15 -> RMSE: 19.41
n_estimators=240 | max_depth=16 | min_samples_leaf=5 -> RMSE: 16.78
n_estimators=240 | max_depth=16 | min_samples_leaf=15 -> RMSE: 16.76

Best RF params: n_estimators=120, max_depth=16, min_samples_leaf=15 -> RMSE: 16.65

Final Random Forest RMSE: 16.65
Histogram Gradient Boosting Regressor 🌈¶
# Defining the hyperparameters to cycle through
hgb_learning_rates = [0.05, 0.10]
hgb_max_depth_list = [4, 6]
hgb_max_iter_list = [120, 160]
best_hgb_rmse = None
best_hgb_params = None # (lr, depth, max_iter)
# Looping through the hyperparameter values to find the best RMSE
print("Histogram Gradient Boosting search:")
for lr in hgb_learning_rates:
    for md in hgb_max_depth_list:
        for it in hgb_max_iter_list:
            hgb = HistGradientBoostingRegressor(
                learning_rate=lr,
                max_depth=md,
                max_iter=it,
                min_samples_leaf=20,
                l2_regularization=0.0,
                early_stopping=True,
                validation_fraction=0.1,
                n_iter_no_change=10,
                random_state=51
            )
            hgb.fit(X_train_enc, y_train)
            preds = hgb.predict(X_val_enc)
            rmse = root_mean_squared_error(y_val, preds)
            print(f"learning_rate={lr} | max_depth={md} | max_iter={it} -> RMSE: {rmse:.2f}")
            if (best_hgb_rmse is None) or (rmse < best_hgb_rmse):
                best_hgb_rmse = rmse
                best_hgb_params = (lr, md, it)
print("\nBest HGB params:",
      f"learning_rate={best_hgb_params[0]}, max_depth={best_hgb_params[1]},",
      f"max_iter={best_hgb_params[2]}, min_samples_leaf=20 -> RMSE: {best_hgb_rmse:.2f}")
# Fit final HGB model on best params
hgb_model = HistGradientBoostingRegressor(
learning_rate=best_hgb_params[0],
max_depth=best_hgb_params[1],
max_iter=best_hgb_params[2],
min_samples_leaf=20,
l2_regularization=0.0,
early_stopping=True,
validation_fraction=0.1,
n_iter_no_change=10,
random_state=51
).fit(X_train_enc, y_train)
val_preds_hgb = hgb_model.predict(X_val_enc)
rmse_hgb = root_mean_squared_error(y_val, val_preds_hgb)
print(f"\nFinal Histogram Gradient Boosting RMSE: {rmse_hgb:.2f}")
Histogram Gradient Boosting search:
learning_rate=0.05 | max_depth=4 | max_iter=120 -> RMSE: 13.19
learning_rate=0.05 | max_depth=4 | max_iter=160 -> RMSE: 13.18
learning_rate=0.05 | max_depth=6 | max_iter=120 -> RMSE: 13.15
learning_rate=0.05 | max_depth=6 | max_iter=160 -> RMSE: 13.14
learning_rate=0.1 | max_depth=4 | max_iter=120 -> RMSE: 13.16
learning_rate=0.1 | max_depth=4 | max_iter=160 -> RMSE: 13.15
learning_rate=0.1 | max_depth=6 | max_iter=120 -> RMSE: 13.12
learning_rate=0.1 | max_depth=6 | max_iter=160 -> RMSE: 13.11

Best HGB params: learning_rate=0.1, max_depth=6, max_iter=160, min_samples_leaf=20 -> RMSE: 13.11

Final Histogram Gradient Boosting RMSE: 13.11
Model Comparison 📊¶
rmse_values = {
"Baseline": rmse_baseline,
"Decision Tree": rmse_dt,
"Random Forest": rmse_rf,
"HistGradientBoosting": rmse_hgb
}
# Sort ascending
items = sorted(rmse_values.items(), key=lambda x: x[1])
labels = [m for m,_ in items]
scores = [s for _,s in items]
plt.figure(figsize=(5,3.2))
bars = plt.barh(labels, scores, color="#7b1fa2")
# Highlight best at the top and visually distinguish the baseline
best_index = 0
baseline_index = labels.index("Baseline")
bars[best_index].set_color("#00897b")
bars[baseline_index].set_hatch("//")
for b, s in zip(bars, scores):
    plt.text(s + 0.1, b.get_y() + b.get_height()/2, f"{s:.2f}", va='center', fontsize=9)
plt.xlabel("RMSE (lower is better)")
plt.title("HistGradientBoosting more than halves the Baseline RMSE")
plt.xlim(0, max(scores)*1.15)
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.2)
plt.tight_layout()
plt.show()
Evaluating the Performance of the HGB Model¶
# Evaluate model
r2 = r2_score(y_val, val_preds_hgb)
print(f"Validation R²: {r2*100:.1f}%")
# Calculate RMSE improvement
improve_pct = (rmse_baseline - rmse_hgb) / rmse_baseline * 100
print(f"RMSE improvement vs baseline: {improve_pct:.1f}%")
Validation R²: 76.6%
RMSE improvement vs baseline: 51.6%
Training the Full Train Dataset Using HistGradientBoosting¶
# Prepare full feature matrix and target
X = podcast_train.drop(columns=[target_col])
y = podcast_train[target_col]
# Identify categorical columns and one-hot encode
cat_cols_full = X.select_dtypes(include=['object', 'category']).columns.tolist()
X_enc = pd.get_dummies(X, columns=cat_cols_full, drop_first=True)
# Best HGB params: learning_rate=0.1, max_depth=6, max_iter=160, min_samples_leaf=20
print("Training final HistGradientBoosting on full data: "
      "learning_rate=0.1, max_depth=6, max_iter=160")
hgb_train = HistGradientBoostingRegressor(
learning_rate=0.1,
max_depth=6,
max_iter=160,
min_samples_leaf=20,
l2_regularization=0.0,
early_stopping=False,
random_state=51
).fit(X_enc, y)
# Compare full train dataset to validation RMSE
train_preds = hgb_train.predict(X_enc)
train_rmse = root_mean_squared_error(y, train_preds)
print("Training on Full Train Dataset Complete:")
print(f"Full Train Dataset RMSE: {train_rmse:.2f}")
Training final HistGradientBoosting on full data: learning_rate=0.1, max_depth=6, max_iter=160
Training on Full Train Dataset Complete:
Full Train Dataset RMSE: 13.03
Fitting the Model on the Test Dataset¶
# Build encoded test feature matrix
test_features = podcast_test[X.columns]
test_enc = pd.get_dummies(test_features, columns=cat_cols_full, drop_first=True)
# Align columns (add any missing, drop extras)
test_enc = test_enc.reindex(columns=X_enc.columns, fill_value=0)
# Predict Listening Time
test_preds = hgb_train.predict(test_enc)
# Create submission DataFrame
submission = pd.DataFrame({
'id': test_ids,
'Listening_Time_minutes': test_preds
}).round(2)
# Output Submission to a CSV file
submission.to_csv("submission.csv", index=False)
print("Submission file written: submission.csv")
# Output Submission DataFrame Head
print("\nSubmission DataFrame Preview:")
submission.head()
Submission file written: submission.csv

Submission DataFrame Preview:
| | id | Listening_Time_minutes |
|---|---|---|
| 0 | 750000 | 56.10 |
| 1 | 750001 | 18.00 |
| 2 | 750002 | 49.26 |
| 3 | 750003 | 79.93 |
| 4 | 750004 | 48.86 |