FantasyFootball marks prediction

In this second part of the analysis, we will apply some machine learning techniques to predict the performances of the football players.

#import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
marks_final= pd.read_csv('dataset_2015_marks.csv', index_col=0)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# The frame keeps the CSV's 'LastMark' column name, which the later cells
# select by that exact label. A rename(columns={'LastMark': 'lastMark'}) call
# used to sit here, but DataFrame.rename returns a new frame and the result
# was discarded, so it never changed anything; it has been removed.
marks_final.head()

	player	name	team	day	mark	rankOwnTeam	playingHome	opponentTeam	rankOpponentTeam	avgPrevious5	avgSoFar	LastMark	isSuff
0	2247	Sportiello	Atalanta	1	7.0	10	1	Verona	10	NaN	NaN	NaN	1
1	2247	Sportiello	Atalanta	2	7.5	10	0	Cagliari	10	7.000000	7.000000	7.0	1
2	2247	Sportiello	Atalanta	3	6.0	5	1	Fiorentina	15	7.250000	7.250000	7.5	1
3	2247	Sportiello	Atalanta	4	6.5	9	0	Inter	6	6.833333	6.833333	6.0	1
4	2247	Sportiello	Atalanta	5	5.0	11	1	Juventus	1	6.750000	6.750000	6.5	0

As a filling value for the missing statistics, let us use 5 for the moment: not too bad, but not good. Let’s say that, when in doubt, we are pessimistic.

marks_final = marks_final.fillna(5)

marks_final.head()

	player	name	team	day	mark	rankOwnTeam	playingHome	opponentTeam	rankOpponentTeam	avgPrevious5	avgSoFar	LastMark	isSuff
0	2247	Sportiello	Atalanta	1	7.0	10	1	Verona	10	5.000000	5.000000	5.0	1
1	2247	Sportiello	Atalanta	2	7.5	10	0	Cagliari	10	7.000000	7.000000	7.0	1
2	2247	Sportiello	Atalanta	3	6.0	5	1	Fiorentina	15	7.250000	7.250000	7.5	1
3	2247	Sportiello	Atalanta	4	6.5	9	0	Inter	6	6.833333	6.833333	6.0	1
4	2247	Sportiello	Atalanta	5	5.0	11	1	Juventus	1	6.750000	6.750000	6.5	0

desc = marks_final.describe() 

print(desc)

            player          day         goal      penalty      owngoal  \
count  9904.000000  9904.000000  9904.000000  9904.000000  9904.000000   
mean   2468.801393    19.501817     0.091680     0.008380     0.003332   
std     140.635213    10.996809     0.315661     0.093354     0.057630   
min    2247.000000     1.000000     0.000000     0.000000     0.000000   
25%    2351.000000    10.000000     0.000000     0.000000     0.000000   
50%    2458.000000    20.000000     0.000000     0.000000     0.000000   
75%    2572.000000    29.000000     0.000000     0.000000     0.000000   
max    2820.000000    38.000000     3.000000     2.000000     1.000000   

              mark  rankOwnTeam  playingHome  rankOpponentTeam  avgPrevious5  \
count  9904.000000  9904.000000  9904.000000       9904.000000   9904.000000   
mean      5.913318    10.341882     0.498485         10.332593      5.859603   
std       0.690952     5.673185     0.500023          5.664493      0.500290   
min       3.000000     1.000000     0.000000          1.000000      4.000000   
25%       5.500000     5.000000     0.000000          5.000000      5.500000   
50%       6.000000    10.000000     0.000000         10.000000      5.900000   
75%       6.500000    15.000000     1.000000         15.000000      6.200000   
max       8.500000    20.000000     1.000000         20.000000      8.250000   

          avgSoFar     LastMark       isSuff  
count  9904.000000  9904.000000  9904.000000  
mean      5.884887     5.722032     0.622779  
std       0.389586     0.725208     0.484716  
min       4.000000     4.000000     0.000000  
25%       5.684211     5.000000     0.000000  
50%       5.928571     5.500000     1.000000  
75%       6.125000     6.000000     1.000000  
max       8.000000     8.500000     1.000000  

Ok now we should switch on our brain. In order to maximize the performance of our classifiers, we should:

study the correlated features (there are a lot of them, especially the engineered ones)
pay attention to the categorical features (even if we transform them into integers, we do not want our algorithm to think that they can be ordered)
remember that some of the algorithms we want to try expect features centered on zero (we will surely try SVM, since our predicted label is binary)

# NOTE(review): LabelEncoder / OneHotEncoder are imported here but the
# encoding below is done with pandas.get_dummies instead.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Get one hot encoding of columns team
# One indicator column per distinct value of 'team', named after the team.
one_hot = pd.get_dummies(marks_final['team'])
# join() aligns on the row index, appending the indicator columns to the frame.
marks_final = marks_final.join(one_hot)
marks_final.head()

	player	name	team	day	mark	rankOwnTeam	playingHome	...
0	2247	Sportiello	Atalanta	1	7.0	10	1	...
1	2247	Sportiello	Atalanta	2	7.5	10	0	...
2	2247	Sportiello	Atalanta	3	6.0	5	1	...
3	2247	Sportiello	Atalanta	4	6.5	9	0	...
4	2247	Sportiello	Atalanta	5	5.0	11	1	...

5 rows × 36 columns

We should do the same with the opponent. If we do this, though, the new column names would overlap with the team columns, so we have to rename the teams when they refer to opponents.

one_hot = pd.get_dummies(marks_final['opponentTeam'])
one_hot.head()

	Cagliari	Fiorentina	Inter	Juventus	Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

one_hot.columns = "Opp:"+one_hot.columns
one_hot.head()

	Opp:Cagliari	Opp:Fiorentina	Opp:Inter	Opp:Juventus	Opp:Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

marks_final = marks_final.join(one_hot)
marks_final.head()

	player	name	team	day	mark	rankOwnTeam	playingHome	...	Opp:Verona
0	2247	Sportiello	Atalanta	1	7.0	10	1	...	1
1	2247	Sportiello	Atalanta	2	7.5	10	0	...	0
2	2247	Sportiello	Atalanta	3	6.0	5	1	...	0
3	2247	Sportiello	Atalanta	4	6.5	9	0	...	0
4	2247	Sportiello	Atalanta	5	5.0	11	1	...	0

5 rows × 56 columns

ok done! focus now on correlations

import seaborn as sns

marks_final.columns

Index([u'player', u'name', u'team', u'day', u'goal', u'penalty', u'owngoal',
       u'mark', u'rankOwnTeam', u'playingHome', u'opponentTeam',
       u'rankOpponentTeam', u'avgPrevious5', u'avgSoFar', u'LastMark',
       u'isSuff', u'Atalanta', u'Cagliari', u'Cesena', u'Chievo', u'Empoli',
       u'Fiorentina', u'Genoa', u'Inter', u'Juventus', u'Lazio', u'Milan',
       u'Napoli', u'Palermo', u'Parma', u'Roma', u'Sampdoria', u'Sassuolo',
       u'Torino', u'Udinese', u'Verona', u'Opp:Atalanta', u'Opp:Cagliari',
       u'Opp:Cesena', u'Opp:Chievo', u'Opp:Empoli', u'Opp:Fiorentina',
       u'Opp:Genoa', u'Opp:Inter', u'Opp:Juventus', u'Opp:Lazio', u'Opp:Milan',
       u'Opp:Napoli', u'Opp:Palermo', u'Opp:Parma', u'Opp:Roma',
       u'Opp:Sampdoria', u'Opp:Sassuolo', u'Opp:Torino', u'Opp:Udinese',
       u'Opp:Verona'],
      dtype='object')

marks_final.columns[:16]

Index([u'player', u'name', u'team', u'day', u'goal', u'penalty', u'owngoal',
       u'mark', u'rankOwnTeam', u'playingHome', u'opponentTeam',
       u'rankOpponentTeam', u'avgPrevious5', u'avgSoFar', u'LastMark',
       u'isSuff'],
      dtype='object')

marks_final.columns[:16].difference(['player','name','team','opponentTeam'])

Index([u'LastMark', u'avgPrevious5', u'avgSoFar', u'day', u'goal', u'isSuff',
       u'mark', u'owngoal', u'penalty', u'playingHome', u'rankOpponentTeam',
       u'rankOwnTeam'],
      dtype='object')

corr = marks_final[marks_final.columns[:16].difference(['player','name','team','opponentTeam','goal','owngoal','mark','penalty'])].corr()

_ = sns.heatmap(corr, annot = True)

plt.show()

png

corr = marks_final[marks_final.columns[:16].difference(['player','name','team','opponentTeam','goal','owngoal','mark','penalty'])].corr(method='spearman')
sns.heatmap(corr, annot = True)
plt.show()

png

from scipy.stats import pointbiserialr

We have used Spearman and Pearson. We have seen that the only strongly correlated features are the averages, as expected. Let’s now use the point-biserial correlation of each feature against the binary target isSuff.

 
def correlation(X,Y):
    """Print the point-biserial correlation of each numeric feature with Y.

    The features are the first 16 columns of ``X`` minus the identifier,
    categorical and match-outcome columns listed in ``excluded``. Two things
    are printed: the raw list of coefficients (in column order) and a table
    indexed by feature name, sorted by absolute correlation, strongest first.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the feature columns.
    Y : array-like
        Target values, one per row of ``X`` (the notebook passes the 0/1
        ``isSuff`` column).

    Returns
    -------
    None
        The results are only printed.
    """
    excluded = ['player','name','team','opponentTeam','goal','owngoal','mark','penalty']
    # Pick the columns from the frame that was passed in rather than from the
    # module-level marks_final, so the function works on any compatible frame.
    columns = X.columns[:16].difference(excluded)

    # [0] is the correlation coefficient; the p-value is not used here.
    coefficients = [float(pointbiserialr(X[c], Y)[0]) for c in columns]
    print(coefficients)

    param_cor = pd.DataFrame({
        'correlation': coefficients,
        'parameter': list(columns),
        'abs_corr': [abs(r) for r in coefficients],
    })
    # Keep the sorted frame: it used to be bound to a misspelled, unused name,
    # so the printed table was never actually ordered by strength.
    param_cor = param_cor.sort_values(by=['abs_corr'], ascending=False)
    param_cor = param_cor.set_index('parameter')

    print(param_cor)

correlation(marks_final, marks_final['isSuff'])

[0.08503341229851559, 0.099145358562043442, 0.112238463187295, -0.00042075422779776981, 1.0, 0.078053212529890642, 0.013959624950329314, -0.039171776122582443]
                  abs_corr  correlation
parameter                              
LastMark          0.085033     0.085033
avgPrevious5      0.099145     0.099145
avgSoFar          0.112238     0.112238
day               0.000421    -0.000421
isSuff            1.000000     1.000000
playingHome       0.078053     0.078053
rankOpponentTeam  0.013960     0.013960
rankOwnTeam       0.039172    -0.039172

OK, we do not have features that are highly correlated with the predicted variable isSuff. We do have some features which are, as expected, highly correlated with each other, such as the ones related to averages.

let’s try to redo the experiments for the team opponents.

marks_final.columns[36:]

Index([u'Opp:Atalanta', u'Opp:Cagliari', u'Opp:Cesena', u'Opp:Chievo',
       u'Opp:Empoli', u'Opp:Fiorentina', u'Opp:Genoa', u'Opp:Inter',
       u'Opp:Juventus', u'Opp:Lazio', u'Opp:Milan', u'Opp:Napoli',
       u'Opp:Palermo', u'Opp:Parma', u'Opp:Roma', u'Opp:Sampdoria',
       u'Opp:Sassuolo', u'Opp:Torino', u'Opp:Udinese', u'Opp:Verona'],
      dtype='object')

def correlation(X,Y):
    """Print the point-biserial correlation of each one-hot column with Y.

    The features are every column of ``X`` from position 16 onwards (in the
    notebook's frame these are the team and ``Opp:`` indicator columns). Two
    things are printed: the raw list of coefficients (in column order) and a
    table indexed by column name, sorted by absolute correlation, strongest
    first.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the feature columns.
    Y : array-like
        Target values, one per row of ``X`` (the notebook passes the 0/1
        ``isSuff`` column).

    Returns
    -------
    None
        The results are only printed.
    """
    # Pick the columns from the frame that was passed in rather than from the
    # module-level marks_final, so the function works on any compatible frame.
    columns = X.columns[16:]

    # [0] is the correlation coefficient; the p-value is not used here.
    coefficients = [float(pointbiserialr(X[c], Y)[0]) for c in columns]
    print(coefficients)

    param_cor = pd.DataFrame({
        'correlation': coefficients,
        'parameter': list(columns),
        'abs_corr': [abs(r) for r in coefficients],
    })
    # Keep the sorted frame: it used to be bound to a misspelled, unused name,
    # so the printed table was never actually ordered by strength.
    param_cor = param_cor.sort_values(by=['abs_corr'], ascending=False)
    param_cor = param_cor.set_index('parameter')

    print(param_cor)

correlation(marks_final, marks_final['isSuff'])

[-0.017349136227901987, -0.036133080109451494, -0.059240064845519434, 0.020289813965431006, 0.04118256909444723, -0.0038137871375006787, 0.05167221198207473, -0.017802931620200715, 0.059496601376935604, 0.032930901024136615, -0.035363367437981984, -0.036690427295522327, 0.022779323203295853, -0.042939424860968065, 0.010858518686822215, 0.01521466981818387, 0.0029682956718978639, 0.020494875022458978, -0.018986791346488622, -0.0086492008136130494, -0.003950159001969422, 0.031615265415889146, 0.037927714008444018, -0.010779393950454609, -0.01689223144802195, -0.0050433571094218007, -0.025890348584482872, 0.029125033262011778, -0.063246168754700724, -0.03445493463979013, 0.011208747273504476, -0.017872692787034648, -0.012948660103458427, 0.026839801947749712, -0.00075271197474548049, 0.020702477080305686, 0.01537868962421509, -0.022441881671146906, 0.025212354963465747, 0.01641259848540369]
                abs_corr  correlation
parameter                            
Atalanta        0.017349    -0.017349
Cagliari        0.036133    -0.036133
Cesena          0.059240    -0.059240
Chievo          0.020290     0.020290
Empoli          0.041183     0.041183
Fiorentina      0.003814    -0.003814
Genoa           0.051672     0.051672
Inter           0.017803    -0.017803
Juventus        0.059497     0.059497
Lazio           0.032931     0.032931
Milan           0.035363    -0.035363
Napoli          0.036690    -0.036690
Palermo         0.022779     0.022779
Parma           0.042939    -0.042939
Roma            0.010859     0.010859
Sampdoria       0.015215     0.015215
Sassuolo        0.002968     0.002968
Torino          0.020495     0.020495
Udinese         0.018987    -0.018987
Verona          0.008649    -0.008649
Opp:Atalanta    0.003950    -0.003950
Opp:Cagliari    0.031615     0.031615
Opp:Cesena      0.037928     0.037928
Opp:Chievo      0.010779    -0.010779
Opp:Empoli      0.016892    -0.016892
Opp:Fiorentina  0.005043    -0.005043
Opp:Genoa       0.025890    -0.025890
Opp:Inter       0.029125     0.029125
Opp:Juventus    0.063246    -0.063246
Opp:Lazio       0.034455    -0.034455
Opp:Milan       0.011209     0.011209
Opp:Napoli      0.017873    -0.017873
Opp:Palermo     0.012949    -0.012949
Opp:Parma       0.026840     0.026840
Opp:Roma        0.000753    -0.000753
Opp:Sampdoria   0.020702     0.020702
Opp:Sassuolo    0.015379     0.015379
Opp:Torino      0.022442    -0.022442
Opp:Udinese     0.025212     0.025212
Opp:Verona      0.016413     0.016413

From these numbers we can say that there is a slightly higher probability of getting a good score if:

you play for Genoa, Juventus, which are probably the most organized teams
you play against: Cesena, Cagliari

and a slightly higher probability of getting a bad score if:

you play for Cesena and Parma
you play against Juventus. There is an interesting difference between the 2nd and 3rd teams in the final ranking, Roma and Lazio: playing against Lazio seems to be harder in terms of final mark.

This info could be precious: a well-organized team will probably deliver pretty high marks even if it is not aiming for the title. Of course, owning a player of a top team is almost always a plus.

To be sure, let’s try to look at the most interesting features given these results

X = marks_final[marks_final.columns[:16].difference(['player','name','team','opponentTeam','goal','owngoal','mark','penalty','isSuff'])]
Y = marks_final['isSuff']
select_top_4 = SelectKBest(score_func=chi2, k =4)

fit = select_top_4.fit(X,Y)
features = fit.transform(X)

features[0:10]

array([[  5. ,   1. ,  10. ,  10. ],
       [  7. ,   0. ,  10. ,  10. ],
       [  7.5,   1. ,  15. ,   5. ],
       [  6. ,   0. ,   6. ,   9. ],
       [  6.5,   1. ,   1. ,  11. ],
       [  5. ,   0. ,   4. ,  15. ],
       [  6.5,   1. ,  19. ,  16. ],
       [  6. ,   0. ,   5. ,  13. ],
       [  6. ,   1. ,   7. ,  15. ],
       [  8. ,   0. ,  12. ,  16. ]])

marks_final[marks_final.columns[:16].difference(['player','name','team','opponentTeam','goal','owngoal','mark','penalty','isSuff'])]

	LastMark	avgPrevious5	avgSoFar	day	playingHome	rankOpponentTeam	rankOwnTeam
0	5.0	5.000000	5.000000	1	1	10	10
1	7.0	7.000000	7.000000	2	0	10	10
2	7.5	7.250000	7.250000	3	1	15	5
3	6.0	6.833333	6.833333	4	0	6	9
4	6.5	6.750000	6.750000	5	1	1	11
5	5.0	6.400000	6.400000	6	0	4	15
6	6.5	6.300000	6.416667	7	1	19	16
7	6.0	6.000000	6.357143	8	0	5	13
8	6.0	6.000000	6.312500	9	1	7	15
9	8.0	6.300000	6.500000	10	0	12	16
10	6.5	6.600000	6.500000	11	0	14	15
11	7.0	6.700000	6.545455	12	1	2	16
12	6.0	6.700000	6.500000	13	0	14	17
13	7.5	7.000000	6.576923	14	1	19	16
14	6.0	6.600000	6.535714	15	0	6	14
15	6.5	6.600000	6.533333	16	1	9	15
16	5.5	6.300000	6.468750	17	0	6	17
17	6.5	6.400000	6.470588	18	1	16	17
18	7.0	6.300000	6.500000	19	0	8	17
19	6.0	6.300000	6.473684	20	0	14	15
20	6.5	6.300000	6.475000	21	1	16	15
21	6.5	6.500000	6.476190	22	0	6	15
22	6.0	6.400000	6.454545	23	1	11	15
23	5.5	6.100000	6.413043	24	0	1	17
24	7.0	6.300000	6.437500	25	1	6	17
25	5.0	6.000000	6.380000	26	0	20	17
26	7.0	6.100000	6.403846	27	1	12	17
27	6.0	6.100000	6.388889	28	0	4	17
28	6.5	6.300000	6.392857	29	1	8	17
29	5.5	6.000000	6.362069	30	1	11	17
...	...	...	...	...	...	...	...
9874	5.0	5.000000	5.000000	31	1	9	10
9875	6.5	6.500000	6.500000	32	1	2	10
9876	5.0	5.000000	5.000000	32	0	7	18
9877	5.0	5.500000	5.500000	34	0	7	18
9878	5.0	5.500000	5.500000	37	1	18	19
9879	5.0	5.000000	5.000000	32	0	20	11
9880	5.0	6.000000	6.000000	34	0	16	11
9881	6.0	6.000000	6.000000	35	1	17	10
9882	5.0	5.000000	5.000000	34	1	20	18
9883	5.0	6.000000	6.000000	37	0	19	18
9884	6.0	6.000000	6.000000	38	1	16	18
9885	5.0	5.000000	5.000000	34	1	11	16
9886	5.0	6.000000	6.000000	36	1	10	16
9887	6.5	6.250000	6.250000	37	0	15	13
9888	5.0	5.000000	5.000000	35	0	10	17
9889	5.0	5.000000	5.000000	35	0	10	17
9890	5.0	5.000000	5.000000	35	1	18	1
9891	5.0	5.000000	5.000000	35	0	13	6
9892	5.0	6.000000	6.000000	37	0	15	7
9893	5.0	5.000000	5.000000	37	0	14	17
9894	5.0	5.000000	5.000000	37	1	9	10
9895	5.0	5.000000	5.000000	37	0	10	9
9896	6.0	6.000000	6.000000	38	1	20	9
9897	5.0	5.000000	5.000000	37	1	13	15
9898	6.5	6.500000	6.500000	38	0	18	16
9899	5.0	5.000000	5.000000	38	0	9	20
9900	5.0	5.000000	5.000000	38	1	11	2
9901	5.0	5.000000	5.000000	38	1	11	2
9902	5.0	5.000000	5.000000	38	1	20	9
9903	5.0	5.000000	5.000000	38	0	18	16

9904 rows × 7 columns

It seems that the most significant features are last mark, playinghome and the two ranks

X_features = pd.DataFrame(data = marks_final, columns = ["LastMark","playingHome","rankOwnTeam","rankOpponentTeam"])
X_features.head()

	LastMark	playingHome	rankOwnTeam	rankOpponentTeam
0	5.0	1	10	10
1	7.0	0	10	10
2	7.5	1	5	15
3	6.0	0	9	6
4	6.5	1	11	1

Y = marks_final['isSuff']

Y.describe()

count    9904.000000
mean        0.622779
std         0.484716
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: isSuff, dtype: float64

Since some of the algorithms assume a Gaussian distribution, let’s do some standardization

from sklearn.preprocessing import StandardScaler
# Standardise every selected feature except the binary playingHome flag.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed further down, so test-set statistics leak into
# the scaling. Consider fitting on the training rows only.
rescaledX = StandardScaler().fit_transform(X_features[X_features.columns.difference(['playingHome'])])

# fit_transform returns a plain array, so rebuild a DataFrame with the same
# column labels (Index.difference yields them in sorted order).
# NOTE(review): the rebuilt frame gets a default 0..n-1 index; the later join
# with marks_final['playingHome'] assumes marks_final is indexed the same way.
rescaledX = pd.DataFrame(data = rescaledX, columns= X_features.columns.difference(['playingHome']))

rescaledX.head()

	LastMark	rankOpponentTeam	rankOwnTeam
0	-0.995670	-0.058718	-0.060266
1	1.762298	-0.058718	-0.060266
2	2.451790	0.824018	-0.941649
3	0.383314	-0.764907	-0.236543
4	1.072806	-1.647643	0.116011

rescaledX = rescaledX.join(marks_final['playingHome'])

X = rescaledX

For this first exploratory run, since the dataset is not so big, let’s try as many algorithms as we can.

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, random_state = 42, test_size = 0.2)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier

models = []
models.append(("LRegression",LogisticRegression()))
models.append(("NaiveBayesian",GaussianNB()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("DTree",DecisionTreeClassifier()))
models.append(("RForest",RandomForestClassifier()))
models.append(("NeuralNetworkMLP",MLPClassifier()))
models.append(("SVM",SVC()))

# Score every candidate model with the same 10-fold cross-validation on the
# training split and print the mean accuracy of each.
# The folds are contiguous and unshuffled, hence already deterministic.
# random_state only matters together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling,
# so it is left out. One splitter is shared by all models.
kfold = KFold(n_splits=10)
results = []
names = []
for name, model in models:
    cv_result = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(cv_result)
for name, cv_result in zip(names, results):
    print(name, cv_result.mean())

('LRegression', 0.63119642197511039)
('NaiveBayesian', 0.63144846956322376)
('KNN', 0.63510785662425007)
('DTree', 0.62526462608429823)
('RForest', 0.6348580381367267)
('NeuralNetworkMLP', 0.63043980154635904)
('SVM', 0.63094373750111454)

ax = sns.boxplot(data=results)
ax.set_xticklabels(names)
plt.show()

png

( Credits to Arun Prakash (Kaggle) for the very funny visualization )

let’s repeat it with other features (the team and the opponent)

rescaledX = rescaledX.join(marks_final[marks_final.columns[16:]])
rescaledX.head()

	LastMark	rankOpponentTeam	rankOwnTeam	playingHome	Atalanta	...	Opp:Verona
0	-0.995670	-0.058718	-0.060266	1	1	...	1
1	1.762298	-0.058718	-0.060266	0	1	...	0
2	2.451790	0.824018	-0.941649	1	1	...	0
3	0.383314	-0.764907	-0.236543	0	1	...	0
4	1.072806	-1.647643	0.116011	1	1	...	0

5 rows × 44 columns

X = rescaledX

# Re-split with the enlarged feature matrix, using the same seed and test size
# as the first run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

# Score every candidate model with the same 10-fold cross-validation on the
# training split and print the mean accuracy of each.
# The folds are contiguous and unshuffled, hence already deterministic.
# random_state only matters together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling,
# so it is left out. One splitter is shared by all models.
kfold = KFold(n_splits=10)
results = []
names = []
for name, model in models:
    cv_result = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(cv_result)
for name, cv_result in zip(names, results):
    print(name, cv_result.mean())

('LRegression', 0.64243204427630651)
('NaiveBayesian', 0.57566490886163002)
('KNN', 0.68257114015310738)
('DTree', 0.64571662399531249)
('RForest', 0.66262689951214537)
('NeuralNetworkMLP', 0.68913186085317235)
('SVM', 0.63182630211318735)

ax = sns.boxplot(data=results)
ax.set_xticklabels(names)
plt.show()

png

KNN and neural networks seem to be the most promising techniques. So far we have only tried them with their default configurations.

knn = KNeighborsClassifier()
knn.fit(X_train,Y_train)
predictions = knn.predict(X_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(accuracy_score(Y_test,predictions))

0.653205451792

print(classification_report(Y_test,predictions))

             precision    recall  f1-score   support

          0       0.59      0.49      0.53       811
          1       0.68      0.77      0.72      1170

avg / total       0.65      0.65      0.65      1981

conf = confusion_matrix(Y_test,predictions)

label = ["0","1"]
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label)
plt.show()

png

MLP = MLPClassifier()
MLP.fit(X_train,Y_train)
predictions = MLP.predict(X_test)

print(accuracy_score(Y_test,predictions))

0.668854114084

print(classification_report(Y_test,predictions))

             precision    recall  f1-score   support

          0       0.64      0.45      0.53       811
          1       0.68      0.82      0.75      1170

avg / total       0.66      0.67      0.66      1981

conf = confusion_matrix(Y_test,predictions)

label = ["0","1"]
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label)
plt.show()

png

Not bad! .. To be continued

Written on March 7, 2017

	Cagliari	Fiorentina	Inter	Juventus	Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

	Opp:Cagliari	Opp:Fiorentina	Opp:Inter	Opp:Juventus	Opp:Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

	Cagliari	Fiorentina	Inter	Juventus	Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

	Opp:Cagliari	Opp:Fiorentina	Opp:Inter	Opp:Juventus	Opp:Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

	Cagliari	Fiorentina	Inter	Juventus	Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0

	Opp:Cagliari	Opp:Fiorentina	Opp:Inter	Opp:Juventus	Opp:Verona
0	0	0	0	0	1
1	1	0	0	0	0
2	0	1	0	0	0
3	0	0	1	0	0
4	0	0	0	1	0