Revision | 372bf58aa1eba13e378314c8b9d343fd7fd4210b (tree)
---|---
Date | 2013-06-25 02:04:38
Author | Lorenzo Isella <lorenzo.isella@gmai...>
Committer | Lorenzo Isella
A script to carry out logistic regression in Python.
I still need to read and understand it, and it may well be possible to
speed it up or otherwise improve it.
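
For context, a hedged sketch of how the script below is meant to be driven. The file names are the defaults hard-coded in main(); the expected column layout is inferred from the code, not stated anywhere in the commit, and the module name is hypothetical:

    # Assumed inputs (inferred from the code, not documented):
    #   columns 1:-1 of each CSV are used as integer categorical features;
    #   train.csv must additionally contain a 0/1 'ACTION' target column.
    from logistic_regression import main  # hypothetical module name
    main(train='train.csv', test='test.csv', submit='logistic_pred.csv')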
@@ -0,0 +1,164 @@
__author__ = 'Miroslaw Horbal'
__email__ = 'miroslaw@gmail.com'
__date__ = '14-06-2013'

from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations

import numpy as np
import pandas as pd

SEED = 25
def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Hashes every combination of `degree` columns of data into a single
    new categorical feature (by default, all triples of columns).
    """
    new_data = []
    m, n = data.shape
    for indices in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indices]])
    return array(new_data).T
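
# Editorial sketch (not part of the original commit), with made-up values:
#
#   d = np.array([[1, 2, 3],
#                 [1, 2, 4]])
#   pairs = group_data(d, degree=2)
#
# pairs has shape (2, 3): one hashed integer per row for each of the
# C(3,2) = 3 column pairs (0,1), (0,2), (1,2). Rows sharing the same
# value tuple hash to the same integer, so each combination behaves as
# one new categorical feature.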

def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes a data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Returns the sparse binary matrix and a keymap mapping categories to
    column indices. If a keymap is supplied on input it is used instead
    of creating one, and any categories appearing in the data that are
    not in the keymap are ignored.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap
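
# Editorial sketch (not part of the original commit), hypothetical data:
#
#   X = np.array([['a', 'x'], ['b', 'x'], ['a', 'y']])
#   enc, km = OneHotEncoder(X)        # enc is a 3x4 sparse CSR matrix
#   enc2, _ = OneHotEncoder(X2, km)   # reuse km on new data X2
#
# Reusing the keymap keeps the indicator columns of train and test
# aligned; categories not in the keymap are dropped, leaving that row
# all zeros within the affected block.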

def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' % (i + 1, p))
    with open(filename, 'w') as f:
        f.write('\n'.join(content))
    print 'Saved'

# This loop is essentially from Paul's starter code
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20,
            random_state=i * SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc / N
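
# Editorial note (not part of the original commit): cv_loop is repeated
# random-split ("Monte Carlo") validation rather than true k-fold: N
# independent 80/20 splits whose ROC AUC scores are averaged.
# metrics.auc_score is the old scikit-learn spelling; in later releases
# the equivalent call is metrics.roc_auc_score(y_cv, preds).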

def main(train='train.csv', test='test.csv', submit='logistic_pred.csv'):
    print "Reading dataset..."
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print "Transforming data..."
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)

    y = array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]
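    # Editorial note (not part of the original commit): with n raw
    # categorical columns, X_train_all has n + C(n,2) + C(n,3) columns
    # (e.g. 9 + 36 + 84 = 129 for n = 9); every column, hashed or not,
    # is still categorical, so each gets its own one-hot block below.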

    model = linear_model.LogisticRegression()

    # Xts holds the one-hot encoding of each individual feature in
    # memory, which speeds up the feature selection below
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print "Performing greedy feature selection..."
    score_hist = []
    N = 10
    good_features = set([])
    # Greedy feature selection loop
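    # (Editorial note, not part of the original commit: this is forward
    # selection. Each round adds the single feature whose inclusion
    # gives the best mean CV AUC, and the loop stops once a round fails
    # to beat the previous one; the final, non-improving feature is
    # removed again right after the loop.)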
    while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
        scores = []
        for f in range(len(Xts)):
            if f not in good_features:
                feats = list(good_features) + [f]
                Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                score = cv_loop(Xt, y, model, N)
                scores.append((score, f))
                print "Feature: %i Mean AUC: %f" % (f, score)
        good_features.add(sorted(scores)[-1][1])
        score_hist.append(sorted(scores)[-1])
        print "Current features: %s" % sorted(list(good_features))

    # Remove the last added feature from good_features: it is the one
    # that failed to improve the CV score
    good_features.remove(score_hist[-1][1])
    good_features = sorted(list(good_features))
    print "Selected features %s" % good_features

    print "Performing hyperparameter selection..."
    # Hyperparameter selection loop
    score_hist = []
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    Cvals = np.logspace(-4, 4, 15, base=2)
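    # (Editorial note, not part of the original commit: with base=2 this
    # sweeps C over 15 exponentially spaced points from 2**-4 = 0.0625
    # to 2**4 = 16; C is the inverse regularisation strength of
    # LogisticRegression.)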
    for C in Cvals:
        model.C = C
        score = cv_loop(Xt, y, model, N)
        score_hist.append((score, C))
        print "C: %f Mean AUC: %f" % (C, score)
    bestC = sorted(score_hist)[-1][1]
    print "Best C value: %f" % (bestC)

    print "Performing One Hot Encoding on entire dataset..."
    Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
    Xt, keymap = OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print "Training full model..."
    model.fit(X_train, y)

    print "Making prediction and saving results..."
    preds = model.predict_proba(X_test)[:, 1]
    create_test_submission(submit, preds)

if __name__ == "__main__":
    args = {'train': 'train.csv',
            'test': 'test.csv',
            'submit': 'logistic_regression_pred.csv'}
    main(**args)
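
# Editorial note (not part of the original commit): the script targets
# Python 2 and the scikit-learn/pandas APIs of 2013. On current stacks
# the equivalents would be sklearn.model_selection.train_test_split,
# metrics.roc_auc_score and DataFrame.iloc in place of the since-removed
# cross_validation module, metrics.auc_score and .ix indexer, plus
# print() as a function.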