Revision | 372bf58aa1eba13e378314c8b9d343fd7fd4210b (tree)
---|---
Date | 2013-06-25 02:04:38
Author | Lorenzo Isella <lorenzo.isella@gmai...>
Committer | Lorenzo Isella
A script to carry out logistic regression in Python.
I still need to read and understand it, and it may well be possible to
speed it up or otherwise improve it.
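
For context, a hedged sketch of how the script below is meant to be driven. The file names are the defaults hard-coded in main(); the expected column layout is inferred from the code, not stated anywhere in the commit, and the module name is hypothetical:

    # Assumed inputs (inferred from the code, not documented):
    #   columns 1:-1 of each CSV are used as integer categorical features;
    #   train.csv must additionally contain a 0/1 'ACTION' target column.
    from logistic_regression import main  # hypothetical module name
    main(train='train.csv', test='test.csv', submit='logistic_pred.csv')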
@@ -0,0 +1,164 @@
__author__ = 'Miroslaw Horbal'
__email__ = 'miroslaw@gmail.com'
__date__ = '14-06-2013'

from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations

import numpy as np
import pandas as pd

SEED = 25
def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Hashes every combination of `degree` columns of data into a single
    new categorical feature (by default, all triples of columns).
    """
    new_data = []
    m, n = data.shape
    for indices in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indices]])
    return array(new_data).T
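
# Editorial sketch (not part of the original commit), with made-up values:
#
#   d = np.array([[1, 2, 3],
#                 [1, 2, 4]])
#   pairs = group_data(d, degree=2)
#
# pairs has shape (2, 3): one hashed integer per row for each of the
# C(3,2) = 3 column pairs (0,1), (0,2), (1,2). Rows sharing the same
# value tuple hash to the same integer, so each combination behaves as
# one new categorical feature.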

def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes a data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Returns the sparse binary matrix and a keymap mapping categories to
    column indices. If a keymap is supplied on input it is used instead
    of creating one, and any categories appearing in the data that are
    not in the keymap are ignored.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap
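
# Editorial sketch (not part of the original commit), hypothetical data:
#
#   X = np.array([['a', 'x'], ['b', 'x'], ['a', 'y']])
#   enc, km = OneHotEncoder(X)        # enc is a 3x4 sparse CSR matrix
#   enc2, _ = OneHotEncoder(X2, km)   # reuse km on new data X2
#
# Reusing the keymap keeps the indicator columns of train and test
# aligned; categories not in the keymap are dropped, leaving that row
# all zeros within the affected block.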

def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' % (i + 1, p))
    with open(filename, 'w') as f:
        f.write('\n'.join(content))
    print 'Saved'

# This loop is essentially from Paul's starter code
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20,
            random_state=i * SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc / N
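
# Editorial note (not part of the original commit): cv_loop is repeated
# random-split ("Monte Carlo") validation rather than true k-fold: N
# independent 80/20 splits whose ROC AUC scores are averaged.
# metrics.auc_score is the old scikit-learn spelling; in later releases
# the equivalent call is metrics.roc_auc_score(y_cv, preds).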

def main(train='train.csv', test='test.csv', submit='logistic_pred.csv'):
    print "Reading dataset..."
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print "Transforming data..."
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)

    y = array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]
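    # Editorial note (not part of the original commit): with n raw
    # categorical columns, X_train_all has n + C(n,2) + C(n,3) columns
    # (e.g. 9 + 36 + 84 = 129 for n = 9); every column, hashed or not,
    # is still categorical, so each gets its own one-hot block below.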

    model = linear_model.LogisticRegression()

    # Xts holds the one-hot encoding of each individual feature in
    # memory, which speeds up the feature selection below
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print "Performing greedy feature selection..."
    score_hist = []
    N = 10
    good_features = set([])
    # Greedy feature selection loop
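    # (Editorial note, not part of the original commit: this is forward
    # selection. Each round adds the single feature whose inclusion
    # gives the best mean CV AUC, and the loop stops once a round fails
    # to beat the previous one; the final, non-improving feature is
    # removed again right after the loop.)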
    while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
        scores = []
        for f in range(len(Xts)):
            if f not in good_features:
                feats = list(good_features) + [f]
                Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                score = cv_loop(Xt, y, model, N)
                scores.append((score, f))
                print "Feature: %i Mean AUC: %f" % (f, score)
        good_features.add(sorted(scores)[-1][1])
        score_hist.append(sorted(scores)[-1])
        print "Current features: %s" % sorted(list(good_features))

    # Remove the last added feature from good_features: it is the one
    # that failed to improve the CV score
    good_features.remove(score_hist[-1][1])
    good_features = sorted(list(good_features))
    print "Selected features %s" % good_features

    print "Performing hyperparameter selection..."
    # Hyperparameter selection loop
    score_hist = []
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    Cvals = np.logspace(-4, 4, 15, base=2)
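    # (Editorial note, not part of the original commit: with base=2 this
    # sweeps C over 15 exponentially spaced points from 2**-4 = 0.0625
    # to 2**4 = 16; C is the inverse regularisation strength of
    # LogisticRegression.)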
    for C in Cvals:
        model.C = C
        score = cv_loop(Xt, y, model, N)
        score_hist.append((score, C))
        print "C: %f Mean AUC: %f" % (C, score)
    bestC = sorted(score_hist)[-1][1]
    print "Best C value: %f" % (bestC)

    print "Performing One Hot Encoding on entire dataset..."
    Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
    Xt, keymap = OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print "Training full model..."
    model.fit(X_train, y)

    print "Making prediction and saving results..."
    preds = model.predict_proba(X_test)[:, 1]
    create_test_submission(submit, preds)

if __name__ == "__main__":
    args = {'train': 'train.csv',
            'test': 'test.csv',
            'submit': 'logistic_regression_pred.csv'}
    main(**args)
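
# Editorial note (not part of the original commit): the script targets
# Python 2 and the scikit-learn/pandas APIs of 2013. On current stacks
# the equivalents would be sklearn.model_selection.train_test_split,
# metrics.roc_auc_score and DataFrame.iloc in place of the since-removed
# cross_validation module, metrics.auc_score and .ix indexer, plus
# print() as a function.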