import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Data and code reference: https://www.kaggle.com/learn/feature-engineering
# Load the projects CSV, parsing both timestamp columns as datetimes so the
# `.dt` accessor can be used on them.
ks = pd.read_csv(
    'ks-projects-201801.csv',
    parse_dates=['deadline', 'launched'],
)
ks.head(5)
| | ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | usd_pledged_real | usd_goal_real |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000002330 | The Songs of Adelaide & Abullah | Poetry | Publishing | GBP | 2015-10-09 | 1000.0 | 2015-08-11 12:12:28 | 0.0 | failed | 0 | GB | 0.0 | 0.0 | 1533.95 |
| 1 | 1000003930 | Greeting From Earth: ZGAC Arts Capsule For ET | Narrative Film | Film & Video | USD | 2017-11-01 | 30000.0 | 2017-09-02 04:43:57 | 2421.0 | failed | 15 | US | 100.0 | 2421.0 | 30000.00 |
| 2 | 1000004038 | Where is Hank? | Narrative Film | Film & Video | USD | 2013-02-26 | 45000.0 | 2013-01-12 00:20:50 | 220.0 | failed | 3 | US | 220.0 | 220.0 | 45000.00 |
| 3 | 1000007540 | ToshiCapital Rekordz Needs Help to Complete Album | Music | Music | USD | 2012-04-16 | 5000.0 | 2012-03-17 03:24:11 | 1.0 | failed | 1 | US | 1.0 | 1.0 | 5000.00 |
| 4 | 1000011046 | Community Film Project: The Art of Neighborhoo... | Film & Video | Film & Video | USD | 2015-08-29 | 19500.0 | 2015-07-04 08:35:03 | 1283.0 | canceled | 14 | US | 1283.0 | 1283.0 | 19500.00 |
# Show every distinct value that appears in the `state` column.
print('Unique values in `state` column:', ks['state'].unique().tolist())
Unique values in `state` column: ['failed', 'canceled', 'successful', 'live', 'undefined', 'suspended']
# Keep only the rows whose state is not "live".
ks = ks.loc[ks['state'] != 'live']
# Add the binary target (1 for "successful", 0 for every other state) and the
# calendar parts of the launch timestamp.
ks = ks.assign(outcome=ks['state'].eq('successful').astype(int))
ks = ks.assign(
    hour=ks['launched'].dt.hour,
    day=ks['launched'].dt.day,
    month=ks['launched'].dt.month,
    year=ks['launched'].dt.year,
)
from sklearn.preprocessing import LabelEncoder

# Categorical columns to convert to integer codes.
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

# Re-fit the same encoder on each column in turn and collect the codes,
# keeping the original row index.
encoded = pd.DataFrame(
    {col: encoder.fit_transform(ks[col]) for col in cat_features},
    index=ks.index,
)

# `ks` and `encoded` share the same index, so they can be joined directly.
data = ks[['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)
data.head(5)
| | goal | hour | day | month | year | outcome | category | currency | country |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000.0 | 12 | 11 | 8 | 2015 | 0 | 108 | 5 | 9 |
| 1 | 30000.0 | 4 | 2 | 9 | 2017 | 0 | 93 | 13 | 22 |
| 2 | 45000.0 | 0 | 12 | 1 | 2013 | 0 | 93 | 13 | 22 |
| 3 | 5000.0 | 3 | 17 | 3 | 2012 | 0 | 90 | 13 | 22 |
| 4 | 19500.0 | 8 | 4 | 7 | 2015 | 0 | 55 | 13 | 22 |
# Splitting: the last 10% of rows form the test set, the 10% before that the
# validation set, and everything earlier the training set.
valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

# Slice with explicit non-negative positions. The negative-offset form
# (`data[:-2 * valid_size]`, `data[-valid_size:]`) breaks when valid_size is 0:
# `data[:-0]` is empty and `data[-0:]` is the whole frame.
n_rows = len(data)
valid_start = n_rows - 2 * valid_size
test_start = n_rows - valid_size

train = data.iloc[:valid_start]
valid = data.iloc[valid_start:test_start]
test = data.iloc[test_start:]
# Baseline model
import lightgbm as lgb

# Every column except the target is used as a feature.
feature_cols = train.columns.drop('outcome')
dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# Stop training once the validation metric has not improved for 10 rounds.
# Early stopping is requested through a callback because `lgb.train` no longer
# accepts the `early_stopping_rounds` / `verbose_eval` keyword arguments as of
# LightGBM 4.0. verbose=False keeps the callback silent, matching the previous
# verbose_eval=False.
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

from sklearn import metrics

# Score the model on the held-out test rows.
ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(test['outcome'], ypred)
print(f"Test AUC score: {score}")
Test AUC score: 0.747615303004287
# Interactions: combine two categorical columns into a single
# "<category>_<country>" string feature.
interactions = ks['category'] + ("_" + ks['country'])
print(interactions.head(5))
0            Poetry_GB
1    Narrative Film_US
2    Narrative Film_US
3             Music_US
4      Film & Video_US
dtype: object
def time_since_last_project(series):
    """Return the gap, in hours, between each timestamp and the previous one.

    Args:
        series: A datetime pandas Series. Differences are taken between
            consecutive elements in the order given, so sort it beforehand
            if chronological gaps are wanted.

    Returns:
        A float Series with the same index as ``series``. The first element
        is NaN because it has no predecessor.
    """
    # diff() yields timedeltas; convert them from seconds to hours.
    return series.diff().dt.total_seconds() / 3600.
# Order projects by launch time, then compute, within each category, the
# hours elapsed since the previous launch.
df = ks.loc[:, ['category', 'launched']].sort_values(by='launched')
timedeltas = df.groupby(by='category').transform(time_since_last_project)
timedeltas.head(n=20)
| | launched |
|---|---|
| 94579 | NaN |
| 319002 | NaN |
| 247913 | NaN |
| 48147 | NaN |
| 75397 | NaN |
| 2842 | 0.000000 |
| 273779 | NaN |
| 169268 | NaN |
| 322000 | NaN |
| 138572 | NaN |
| 325391 | NaN |
| 122662 | 137.130833 |
| 213711 | NaN |
| 345606 | 145.941111 |
| 235255 | NaN |
| 98954 | 344715.626944 |
| 342226 | NaN |
| 275091 | NaN |
| 284115 | NaN |
| 32898 | NaN |
# Numerical transformation examples
# Each histogram is drawn on its own figure. Without plt.figure(), consecutive
# plt.hist calls in the same script or cell draw onto the same axes and each
# plt.title overwrites the previous one.

# Binned raw data
plt.figure()
plt.hist(ks.goal, range=(0, 100000), bins=50);
plt.title('Goal');

# Square-root transformation
plt.figure()
plt.hist(np.sqrt(ks.goal), range=(0, 400), bins=50);
plt.title('Sqrt(Goal)');

# Log transformation
plt.figure()
plt.hist(np.log(ks.goal), range=(0, 25), bins=50);
plt.title('Log(Goal)');