In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#data and code reference: https://www.kaggle.com/learn/feature-engineering
In [4]:
#loading data
ks = pd.read_csv('ks-projects-201801.csv',
                 parse_dates=['deadline', 'launched'])
ks.head()
Out[4]:
ID name category main_category currency deadline goal launched pledged state backers country usd pledged usd_pledged_real usd_goal_real
0 1000002330 The Songs of Adelaide & Abullah Poetry Publishing GBP 2015-10-09 1000.0 2015-08-11 12:12:28 0.0 failed 0 GB 0.0 0.0 1533.95
1 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET Narrative Film Film & Video USD 2017-11-01 30000.0 2017-09-02 04:43:57 2421.0 failed 15 US 100.0 2421.0 30000.00
2 1000004038 Where is Hank? Narrative Film Film & Video USD 2013-02-26 45000.0 2013-01-12 00:20:50 220.0 failed 3 US 220.0 220.0 45000.00
3 1000007540 ToshiCapital Rekordz Needs Help to Complete Album Music Music USD 2012-04-16 5000.0 2012-03-17 03:24:11 1.0 failed 1 US 1.0 1.0 5000.00
4 1000011046 Community Film Project: The Art of Neighborhoo... Film & Video Film & Video USD 2015-08-29 19500.0 2015-07-04 08:35:03 1283.0 canceled 14 US 1283.0 1283.0 19500.00
In [6]:
print('Unique values in `state` column:', list(ks.state.unique()))
Unique values in `state` column: ['failed', 'canceled', 'successful', 'live', 'undefined', 'suspended']
In [7]:
# Drop live projects
ks = ks.query('state != "live"')

# Add outcome column, "successful" == 1, others are 0
ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))
In [8]:
ks = ks.assign(hour=ks.launched.dt.hour,
               day=ks.launched.dt.day,
               month=ks.launched.dt.month,
               year=ks.launched.dt.year)
In [9]:
from sklearn.preprocessing import LabelEncoder

cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

# Apply the label encoder to each column
encoded = ks[cat_features].apply(encoder.fit_transform)
In [10]:
# Since ks and encoded have the same index and I can easily join them
data = ks[['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)
data.head()
Out[10]:
goal hour day month year outcome category currency country
0 1000.0 12 11 8 2015 0 108 5 9
1 30000.0 4 2 9 2017 0 93 13 22
2 45000.0 0 12 1 2013 0 93 13 22
3 5000.0 3 17 3 2012 0 90 13 22
4 19500.0 8 4 7 2015 0 55 13 22
In [11]:
#splitting
valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

train = data[:-2 * valid_size]
valid = data[-2 * valid_size:-valid_size]
test = data[-valid_size:]
In [13]:
#basline model
import lightgbm as lgb

feature_cols = train.columns.drop('outcome')

dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=10, verbose_eval=False)
In [14]:
from sklearn import metrics
ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(test['outcome'], ypred)

print(f"Test AUC score: {score}")
Test AUC score: 0.747615303004287
In [18]:
#interactions
interactions = ks['category'] + "_" + ks['country']
print(interactions.head())
0            Poetry_GB
1    Narrative Film_US
2    Narrative Film_US
3             Music_US
4      Film & Video_US
dtype: object
In [19]:
def time_since_last_project(series):
    # Return the time in hours
    return series.diff().dt.total_seconds() / 3600.

df = ks[['category', 'launched']].sort_values('launched')
timedeltas = df.groupby('category').transform(time_since_last_project)
timedeltas.head(20)
Out[19]:
launched
94579 NaN
319002 NaN
247913 NaN
48147 NaN
75397 NaN
2842 0.000000
273779 NaN
169268 NaN
322000 NaN
138572 NaN
325391 NaN
122662 137.130833
213711 NaN
345606 145.941111
235255 NaN
98954 344715.626944
342226 NaN
275091 NaN
284115 NaN
32898 NaN
In [21]:
#Numerical transformation example
#binned data
plt.hist(ks.goal, range=(0, 100000), bins=50);
plt.title('Goal');
In [22]:
#sqrt transformation
plt.hist(np.sqrt(ks.goal), range=(0, 400), bins=50);
plt.title('Sqrt(Goal)');
In [23]:
#log transformation
plt.hist(np.log(ks.goal), range=(0, 25), bins=50);
plt.title('Log(Goal)');
In [ ]: