Fifa 2017 Player position Classifiers

5 minute read

At the Codenation website, there are several Data Science challenges to apply the Machine Learning knowledge in a practical way. One of these challenges is to discover the position of the players of FIFA© 2017 based on the characteristics of these players.

Development

Primeiramente, devemos importar as bibliotecas utilizadas:

Importing things

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import collections as cll
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Reading the data

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
ans = pd.DataFrame()
# Check if the test data is in the training data
print(set(test.columns).issubset(set(train.columns)))
True

Quick data exploration

# What attributes do we have? What type are they?
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4829 entries, 0 to 4828
Data columns (total 55 columns):
ID                                    4829 non-null object
height_cm                             4829 non-null float64
weight_kg                             4829 non-null float64
eur_wage                              4829 non-null float64
skill_moves                           4829 non-null int64
weak_foot                             4829 non-null int64
crossing                              4829 non-null int64
finishing                             4829 non-null int64
heading_accuracy                      4829 non-null int64
short_passing                         4829 non-null int64
volleys                               4829 non-null int64
dribbling                             4829 non-null int64
curve                                 4829 non-null int64
free_kick_accuracy                    4829 non-null int64
long_passing                          4829 non-null int64
ball_control                          4829 non-null int64
acceleration                          4829 non-null int64
sprint_speed                          4829 non-null int64
agility                               4829 non-null int64
reactions                             4829 non-null int64
balance                               4829 non-null int64
shot_power                            4829 non-null int64
jumping                               4829 non-null int64
stamina                               4829 non-null int64
strength                              4829 non-null int64
long_shots                            4829 non-null int64
aggression                            4829 non-null int64
interceptions                         4829 non-null int64
positioning                           4829 non-null int64
vision                                4829 non-null int64
penalties                             4829 non-null int64
composure                             4829 non-null int64
marking                               4829 non-null int64
standing_tackle                       4829 non-null int64
body_type                             4829 non-null object
nationality                           4829 non-null object
preferred_foot                        4829 non-null object
outside_foot_shot_trait               4829 non-null bool
playmaker_trait                       4829 non-null bool
power_free_kick_trait                 4829 non-null bool
power_header_trait                    4829 non-null bool
puncher_trait                         4829 non-null bool
rushes_out_of_goal_trait              4829 non-null bool
saves_with_feet_trait                 4829 non-null bool
selfish_trait                         4829 non-null bool
skilled_dribbling_trait               4829 non-null bool
takes_finesse_free_kicks_trait        4829 non-null bool
target_forward_trait                  4829 non-null bool
team_player_trait                     4829 non-null bool
technical_dribbler_trait              4829 non-null bool
tries_to_beat_defensive_line_trait    4829 non-null bool
poacher_speciality                    4829 non-null bool
speedster_speciality                  4829 non-null bool
aerial_threat_speciality              4829 non-null bool
preferred_pos                         4829 non-null object
dtypes: bool(17), float64(3), int64(30), object(5)
memory usage: 1.5+ MB
# Statisticaly describing the data
train.describe()
height_cm weight_kg eur_wage skill_moves weak_foot crossing finishing heading_accuracy short_passing volleys ... strength long_shots aggression interceptions positioning vision penalties composure marking standing_tackle
count 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 ... 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000 4829.000000
mean 183.109961 76.990267 10016.359495 2.083868 2.819217 41.739283 39.410644 50.172292 52.684407 37.804307 ... 66.328847 40.354111 52.861876 42.873887 42.798716 47.659350 44.654380 54.402568 40.929178 43.800373
std 6.690962 7.292228 21720.982928 0.726900 0.628180 19.045680 20.721807 20.877075 16.181135 18.377069 ... 13.099721 19.790496 19.088383 21.958820 21.213381 14.071487 17.119506 14.146100 23.301924 23.571392
min 156.000000 49.000000 0.000000 1.000000 1.000000 6.000000 3.000000 4.000000 10.000000 4.000000 ... 20.000000 3.000000 11.000000 4.000000 2.000000 10.000000 5.000000 5.000000 4.000000 7.000000
25% 178.000000 72.000000 1000.000000 2.000000 2.000000 27.000000 21.000000 40.000000 42.000000 23.000000 ... 58.000000 22.000000 36.000000 22.000000 25.000000 37.000000 33.000000 46.000000 18.000000 19.000000
50% 184.000000 77.000000 3000.000000 2.000000 3.000000 43.000000 38.000000 55.000000 57.000000 37.000000 ... 67.000000 42.000000 56.000000 45.000000 47.000000 48.000000 45.000000 56.000000 42.000000 46.000000
75% 188.000000 82.000000 9000.000000 3.000000 3.000000 58.000000 58.000000 66.000000 65.000000 52.000000 ... 76.000000 57.000000 68.000000 63.000000 60.000000 58.000000 58.000000 65.000000 63.000000 66.000000
max 203.000000 107.000000 510000.000000 5.000000 5.000000 90.000000 94.000000 94.000000 88.000000 88.000000 ... 96.000000 90.000000 94.000000 90.000000 92.000000 86.000000 90.000000 91.000000 90.000000 92.000000

8 rows × 33 columns

# Checking the data size
print(train.shape)
print(test.shape)
(4829, 55)
(4829, 54)
# Finding out how many classes exists
train['preferred_pos'].nunique()
15

Spliting train and test sets

ans['ID'] = test['ID']
X_train = train.drop(['preferred_pos', 'ID'], axis = 1)
y_train = train['preferred_pos']
X_test = test.drop(['ID'], axis = 1)
# Check for nulls
count_nan_train = len(X_train) - X_train.count()
count_nan_test = len(X_test) - X_test.count()
print(count_nan_train.sum())
print(count_nan_test.sum())
0
0

Checking dataset balance

sns.countplot(y_train)
cll.Counter(y_train)
Counter({'prefers_gk': 965,
         'prefers_cm': 398,
         'prefers_st': 929,
         'prefers_cb': 1117,
         'prefers_rb': 318,
         'prefers_lb': 342,
         'prefers_cdm': 213,
         'prefers_cam': 157,
         'prefers_rw': 53,
         'prefers_lm': 124,
         'prefers_rm': 158,
         'prefers_lw': 37,
         'prefers_lwb': 4,
         'prefers_cf': 12,
         'prefers_rwb': 2})

data classes distribution

# Where the posision quantity is too low, change the label to a more commum position (to increase accuracy)
for i in range(len(y_train)):
    if y_train[i] == 'prefers_rwb' or y_train[i] == 'prefers_lwb' or y_train[i] =='prefers_lw' or y_train[i] =='prefers_cf' or y_train[i] =='prefers_rw':
        y_train[i] = 'prefers_cb'

Bodyshape Analysis

print(X_train['body_type'].unique())
print(X_test['body_type'].unique())
['Stocky' 'Normal' 'Lean' 'Courtois']
['Normal' 'Stocky' 'Lean' 'Akinfenwa' 'Neymar']
  • Courtois -> Lean
  • Akinfenwa -> Stocky
  • Neymar -> Lean
# Changing strange body types
for i in range(len(X_train)):
    if X_train['body_type'][i] == 'Courtois':
        X_train['body_type'][i] = 'Lean'

for i in range(len(X_test)):
    if X_test['body_type'][i] == 'Akinfenwa':
        X_test['body_type'][i] = 'Stocky'
    if X_test['body_type'][i] == 'Neymar':
        X_test['body_type'][i] = 'Lean'

Dummy variables

body_shape = pd.get_dummies(X_train['body_type'])
X_train = pd.concat([X_train, body_shape], axis = 1).drop(['body_type'], axis = 1)
X_train.drop(['nationality'], axis = 1, inplace = True)
preferred_foot = pd.get_dummies(X_train['preferred_foot'])
X_train = pd.concat([X_train, preferred_foot], axis = 1).drop(['preferred_foot'], axis = 1)

Correlation

# Correlation between the attributes
plt.figure(figsize=(12,12))
sns.heatmap(X_train.corr(),cmap='Blues')

style transfer example

Scaling attributes

features = X_train.columns
Scaler = preprocessing.StandardScaler()
X_train = Scaler.fit_transform(X_train)

Models

# Using k_folds to evaluate
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
scoring = 'accuracy'

Random Forest

%%time
rfc = RandomForestClassifier(n_estimators=100)
score = cross_val_score(rfc, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(round(np.mean(score)*100, 2))
86.35
CPU times: user 6.46 s, sys: 7.39 ms, total: 6.47 s
Wall time: 6.35 s

Naive Bayes

bnb = BernoulliNB()
score = cross_val_score(bnb, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(round(np.mean(score)*100, 2))
78.19

Neural Network

%%time
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(25, 15, 5), random_state=1,early_stopping=True)
score = cross_val_score(mlp, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(round(np.mean(score)*100, 2))
84.1
CPU times: user 36.7 s, sys: 27.3 ms, total: 36.8 s
Wall time: 9.2 s

XGBoost

n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

xgb = XGBClassifier()

grid_search = GridSearchCV(xgb, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=k_fold, verbose=0)
score = cross_val_score(grid_search, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(round(np.mean(score)*100, 2))
87.31
best_xgb = grid_search.fit(X_train, y_train)
best_xgb.best_params_
{'max_depth': 4, 'n_estimators': 150}

Feature importance

# Fit on the Random Forest to calculate the feature importance for each attribute
rfc.fit(X_train, y_train)
importances = rfc.feature_importances_
indices = np.argsort(importances)
# Print the top 15 feature ranking
quant = 15
plt.title('Feature Importances')
plt.barh(range(quant), importances[indices][-quant:], color='b', align='center')
plt.yticks(range(quant), [features[i] for i in indices[-quant:]])
plt.xlabel('Relative Importance')
plt.show()

style transfer example

Making the submission

# Pre-processing the test data
body_shape = pd.get_dummies(X_test['body_type'])
X_test = pd.concat([X_test, body_shape], axis = 1).drop(['body_type'], axis = 1)

X_test.drop(['nationality'], axis = 1, inplace = True)

preferred_foot = pd.get_dummies(X_test['preferred_foot'])
X_test = pd.concat([X_test, preferred_foot], axis = 1).drop(['preferred_foot'], axis = 1)

X_test = Scaler.fit_transform(X_test)
# Making the prediction
prediction = best_xgb.predict(X_test)
ans['preferred_pos'] = prediction

ans.to_csv('answer.csv', index=False)

This submission was submitted to Codenation for review and resulted in 87.43% accuracy, which is not bad at all for this challenge.