Les principes du Machine Learning
Exemple par le code
print_example() # Source: http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html
Exemple: Le jeu de données Iris (catégorisation de type d'Iris)
from sklearn import datasets

# Load the classic Iris dataset (150 flowers, 4 measurements, 3 species).
iris = datasets.load_iris()

# Display the first and the last example: feature vector + class label.
for idx in (0, -1):
    print("datapoint:", iris.data[idx], "label:", iris.target[idx])
datapoint: [5.1 3.5 1.4 0.2] label: 0 datapoint: [5.9 3. 5.1 1.8] label: 2
On cherche à modéliser une fonction mathématique qui, à partir des données, va apprendre à prédire la cible.
import sklearn
from sklearn import linear_model

# Define the model — a linear classifier optimised by stochastic gradient
# descent — and fit its parameters on the full Iris dataset.
sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3).fit(iris.data, iris.target)

# Predict the labels of the very points the model was trained on.
predictions = sgd.predict(iris.data)
predictions
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
sklearn.metrics.accuracy_score(iris.target, predictions)
0.9533333333333334
L'erreur de biais dépend de:
Dans le cas du modèle linéaire, il faut que les données soient linéairement séparables.
On sépare les données en :
from sklearn.model_selection import train_test_split
# Split 50/50 into train and test; shuffle first, fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(iris.data,iris.target, test_size= 0.5, shuffle = True, random_state = 0)
# Sanity check: the 150 rows are split into 75 + 75.
len(iris.data), len(x_train), len(x_test), len(y_train), len(y_test)
(150, 75, 75, 75, 75)
from sklearn.model_selection import train_test_split
# Same 50/50 shuffled split as before (random_state=0 gives the identical partition).
x_train, x_test, y_train, y_test = train_test_split(iris.data,iris.target, test_size= 0.5, shuffle = True, random_state = 0)
# Model definition.
# NOTE(review): tol=0 removes the improvement threshold, presumably to let
# training run longer — confirm this was intended rather than tol=None.
sgd = linear_model.SGDClassifier(max_iter = 10000, tol = 0)
# Fitting (optimisation) of the model parameters on the training half only.
sgd = sgd.fit(x_train, y_train)
# Accuracy measured on the same data the model was trained on.
predictions = sgd.predict(x_train)
sklearn.metrics.accuracy_score(y_train, predictions)
0.7333333333333333
# Accuracy on the held-out test half — lower than the train accuracy above,
# illustrating the generalisation gap.
predictions = sgd.predict(x_test)
sklearn.metrics.accuracy_score(y_test, predictions)
0.6
Lorsqu'un modèle a un biais très faible (c'est-à-dire une grande expressivité), il peut "coller aux données"
Un modèle qui sur-apprend généralisera mal.
Tout ce qui a été raconté avant est vrai si:
On dit que les données sont I.I.D. (Indépendamment et Identiquement Distribuées).
pclass;survived;name;sex;age;sibsp;parch;ticket;fare;cabin;embarked;boat;body;home.dest
1;1;Allen, Miss. Elisabeth Walton;female;29;0;0;24160;211,3375;B5;S;2;;St Louis, MO
1;1;Allison, Master. Hudson Trevor;male;0,9167;1;2;113781;151,5500;C22 C26;S;11;;Montreal, PQ / Chesterville, ON
1;0;Allison, Miss. Helen Loraine;female;2;1;2;113781;151,5500;C22 C26;S;;;Montreal, PQ / Chesterville, ON
1;0;Allison, Mr. Hudson Joshua Creighton;male;30;1;2;113781;151,5500;C22 C26;S;;135;Montreal, PQ / Chesterville, ON
1;0;Allison, Mrs. Hudson J C (Bessie Waldo Daniels);female;25;1;2;113781;151,5500;C22 C26;S;;;Montreal, PQ / Chesterville, ON
1;1;Anderson, Mr. Harry;male;48;0;0;19952;26,5500;E12;S;3;;New York, NY
1;1;Andrews, Miss. Kornelia Theodosia;female;63;1;0;13502;77,9583;D7;S;10;;Hudson, NY
1;0;Andrews, Mr. Thomas Jr;male;39;0;0;112050;0,0000;A36;S;;;Belfast, NI
# Semicolon-separated file (European CSV convention; decimals use commas).
titanic_dataset = pandas.read_csv("data/titanic.csv", delimiter = ";")
# Drop the "body" column (body-recovery number — presumably removed because it
# is only known after the outcome; TODO confirm).
del titanic_dataset["body"]
titanic_dataset.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | Allen, Miss. Elisabeth Walton | female | 29 | 0.0 | 0.0 | 24160 | 211,3375 | B5 | S | 2 | St Louis, MO |
1 | 1.0 | 1.0 | Allison, Master. Hudson Trevor | male | 0,9167 | 1.0 | 2.0 | 113781 | 151,5500 | C22 C26 | S | 11 | Montreal, PQ / Chesterville, ON |
2 | 1.0 | 0.0 | Allison, Miss. Helen Loraine | female | 2 | 1.0 | 2.0 | 113781 | 151,5500 | C22 C26 | S | NaN | Montreal, PQ / Chesterville, ON |
3 | 1.0 | 0.0 | Allison, Mr. Hudson Joshua Creighton | male | 30 | 1.0 | 2.0 | 113781 | 151,5500 | C22 C26 | S | NaN | Montreal, PQ / Chesterville, ON |
4 | 1.0 | 0.0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25 | 1.0 | 2.0 | 113781 | 151,5500 | C22 C26 | S | NaN | Montreal, PQ / Chesterville, ON |
# Separate the target column from the rest of the features.
titanic_dataset = titanic_dataset.dropna(subset=['survived'])
survived = titanic_dataset["survived"]
del titanic_dataset["survived"]
# Definition of a linear model.
sgd = linear_model.SGDClassifier(max_iter = 10000)
# SGD = Stochastic Gradient Descent — an optimisation method.
import traceback, sys
try:
    sgd.fit(titanic_dataset, survived)
except Exception:
    traceback.print_exc(file=sys.stdout)
# Fails because the models only accept real-valued inputs.
Traceback (most recent call last): File "<ipython-input-18-dee1661fe682>", line 3, in <module> sgd.fit(titanic_dataset, survived) File "/Users/rallesiardo/.local/share/virtualenvs/base/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py", line 743, in fit sample_weight=sample_weight) File "/Users/rallesiardo/.local/share/virtualenvs/base/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py", line 570, in _fit accept_large_sparse=False) File "/Users/rallesiardo/.local/share/virtualenvs/base/lib/python3.6/site-packages/sklearn/utils/validation.py", line 756, in check_X_y estimator=estimator) File "/Users/rallesiardo/.local/share/virtualenvs/base/lib/python3.6/site-packages/sklearn/utils/validation.py", line 527, in check_array array = np.asarray(array, dtype=dtype, order=order) File "/Users/rallesiardo/.local/share/virtualenvs/base/lib/python3.6/site-packages/numpy/core/numeric.py", line 538, in asarray return array(a, dtype, copy=False, order=order) ValueError: could not convert string to float: 'S'
titanic_dataset.head().transpose()
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
pclass | 1 | 1 | 1 | 1 | 1 |
name | Allen, Miss. Elisabeth Walton | Allison, Master. Hudson Trevor | Allison, Miss. Helen Loraine | Allison, Mr. Hudson Joshua Creighton | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
sex | female | male | female | male | female |
age | 29 | 0,9167 | 2 | 30 | 25 |
sibsp | 0 | 1 | 1 | 1 | 1 |
parch | 0 | 2 | 2 | 2 | 2 |
ticket | 24160 | 113781 | 113781 | 113781 | 113781 |
fare | 211,3375 | 151,5500 | 151,5500 | 151,5500 | 151,5500 |
cabin | B5 | C22 C26 | C22 C26 | C22 C26 | C22 C26 |
embarked | S | S | S | S | S |
boat | 2 | 11 | NaN | NaN | NaN |
home.dest | St Louis, MO | Montreal, PQ / Chesterville, ON | Montreal, PQ / Chesterville, ON | Montreal, PQ / Chesterville, ON | Montreal, PQ / Chesterville, ON |
On sépare certaines colonnes complexes des autres pour les traiter ultérieurement
# Set aside the text-heavy columns; they are feature-engineered further below.
name = titanic_dataset["name"]
cabin = titanic_dataset["cabin"]
home_dest = titanic_dataset["home.dest"]
ticket = titanic_dataset["ticket"]
On encode les variables catégorielles via le codage disjonctif complet (one-hot encoding en anglais)
# One-hot encode the categorical "sex" column (two indicator columns).
sex = pandas.get_dummies(titanic_dataset["sex"])
sex.head()
female | male | |
---|---|---|
0 | 1 | 0 |
1 | 0 | 1 |
2 | 1 | 0 |
3 | 0 | 1 |
4 | 1 | 0 |
# Append the two indicator columns ("female", "male") next to the original "sex".
titanic_dataset = pandas.concat([titanic_dataset,sex], axis = 1 )
titanic_dataset[["sex","female","male"]].head()
sex | female | male | |
---|---|---|---|
0 | female | 1 | 0 |
1 | male | 0 | 1 |
2 | female | 1 | 0 |
3 | male | 0 | 1 |
4 | female | 1 | 0 |
On formate correctement certaines variables numériques et on retire les variables textuelles
# Convert decimal commas to dots so "age" and "fare" can be parsed as numbers,
# then drop the raw text columns that the model cannot consume.
for numeric_col in ("age", "fare"):
    titanic_dataset[numeric_col] = titanic_dataset[numeric_col].replace(",", ".", regex=True)

for text_col in ("ticket", "name", "cabin", "sex", "home.dest", "embarked", "boat"):
    del titanic_dataset[text_col]
On remplace les cellules vides par les moyennes de leurs colonnes respectives.
# Impute every missing cell with the mean of its (float-cast) column.
for column in titanic_dataset.columns:
    # Use the builtin `float`: the `np.float` alias was deprecated in NumPy 1.20
    # and removed in NumPy 1.24, where the original line raises AttributeError.
    # Casting once also avoids the duplicated astype of the original.
    col_as_float = titanic_dataset[column].astype(float)
    titanic_dataset[column] = col_as_float.fillna(col_as_float.mean())
# Préparation de la validation croisée
x_train, x_test, y_train, y_test = train_test_split(titanic_dataset,survived, test_size= 0.5, random_state = 42)
# Définition d'un modèle linéaire
sgd = linear_model.SGDClassifier(max_iter = 50000)
# Optimisation des paramètres du modèle
sgd = sgd.fit(x_train, y_train)
accuracy_train = sklearn.metrics.accuracy_score(y_train, sgd.predict(x_train))
accuracy_train
0.7110091743119266
accuracy_test = sklearn.metrics.accuracy_score(y_test, sgd.predict(x_test))
accuracy_test
0.683969465648855
# Experiment log: one row per model with its train/test accuracy and their gap.
resultats_exp = pandas.DataFrame({"Modèle" : [], "Train" : [], "Test" : [], "Delta" : []})

import math

def add_score(nom, acc_train, acc_test):
    """Append one result row to the global table and return the table."""
    delta = math.fabs(acc_train - acc_test)
    resultats_exp.loc[len(resultats_exp)] = [nom, acc_train, acc_test, delta]
    return resultats_exp
add_score("Linéaire",accuracy_train,accuracy_test)
Modèle | Train | Test | Delta | |
---|---|---|---|---|
0 | Linéaire | 0.711009 | 0.683969 | 0.02704 |
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=8, n_estimators = 20)
clf = clf.fit(x_train, y_train)
accuracy_train = sklearn.metrics.accuracy_score(y_train, clf.predict(x_train))
accuracy_train
0.8960244648318043
accuracy_test = sklearn.metrics.accuracy_score(y_test, clf.predict(x_test))
accuracy_test
0.7984732824427481
add_score("Forêt Aléatoire",accuracy_train,accuracy_test).sort_values(by = "Test", ascending = False)
Modèle | Train | Test | Delta | |
---|---|---|---|---|
1 | Forêt Aléatoire | 0.896024 | 0.798473 | 0.097551 |
0 | Linéaire | 0.711009 | 0.683969 | 0.027040 |
clf = RandomForestClassifier(max_depth=6, n_estimators = 100)
clf = clf.fit(x_train, y_train)
accuracy_train = sklearn.metrics.accuracy_score(y_train, clf.predict(x_train))
accuracy_train
0.8623853211009175
accuracy_test = sklearn.metrics.accuracy_score(y_test, clf.predict(x_test))
accuracy_test
0.7984732824427481
add_score("Forêt Aléatoire 2",accuracy_train,accuracy_test).sort_values(by = "Test", ascending = False)
Modèle | Train | Test | Delta | |
---|---|---|---|---|
1 | Forêt Aléatoire | 0.896024 | 0.798473 | 0.097551 |
2 | Forêt Aléatoire 2 | 0.862385 | 0.798473 | 0.063912 |
0 | Linéaire | 0.711009 | 0.683969 | 0.027040 |
# Feature engineering: flag children (age < 12) and total family size aboard
# (siblings/spouses + parents/children).
titanic_dataset["child"] = titanic_dataset["age"] < 12
titanic_dataset["family"] = titanic_dataset["sibsp"] + titanic_dataset["parch"]
titanic_dataset.head().filter(["child","age"])
child | age | |
---|---|---|
0 | False | 29.0000 |
1 | True | 0.9167 |
2 | True | 2.0000 |
3 | False | 30.0000 |
4 | False | 25.0000 |
name[:10]
0 Allen, Miss. Elisabeth Walton 1 Allison, Master. Hudson Trevor 2 Allison, Miss. Helen Loraine 3 Allison, Mr. Hudson Joshua Creighton 4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) 5 Anderson, Mr. Harry 6 Andrews, Miss. Kornelia Theodosia 7 Andrews, Mr. Thomas Jr 8 Appleton, Mrs. Edward Dale (Charlotte Lamson) 9 Artagaveytia, Mr. Ramon Name: name, dtype: object
name.str.extract(r",(.*)\.")[:10]
0 | |
---|---|
0 | Miss |
1 | Master |
2 | Miss |
3 | Mr |
4 | Mrs |
5 | Mr |
6 | Miss |
7 | Mr |
8 | Mrs |
9 | Mr |
# Extract the honorific between ", " and "." from the name, keep the three
# frequent titles and bucket every rarer one as "LordSomething", then one-hot.
titles = pandas.get_dummies(name.str.extract(r", (.*)\.")\
    .applymap(lambda x: x if x in ["Miss","Mr","Mrs"] else "LordSomething"))
titles.head()
0_LordSomething | 0_Miss | 0_Mr | 0_Mrs | |
---|---|---|---|---|
0 | 0 | 1 | 0 | 0 |
1 | 1 | 0 | 0 | 0 |
2 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 1 | 0 |
4 | 0 | 0 | 0 | 1 |
# NOTE(review): "(.[A-Z])" captures any character followed by an uppercase
# letter, so "B5" matches nothing and "C22 C26" yields " C" — presumably
# "([A-Z])" (the deck letter alone) was intended; confirm before relying on it.
cabin_letter = pandas.get_dummies(cabin.str.extract("(.[A-Z])"))
# NOTE(review): same doubt here — "[A-Z](.[0-9])" captures two characters after
# a deck letter; rows without a cabin are encoded as 0.
cabin_number = cabin.str.extract("[A-Z](.[0-9])").fillna(0)
On ajoute les nouvelles variables au jeu de données
# Add the engineered variables (titles, cabin letter/number) to the dataset.
titanic_dataset = pandas.concat([titanic_dataset,titles,cabin_letter,cabin_number], axis = 1 )
On crée deux nouveaux jeux d'entrainement et de test.
# New train/test split on the enriched dataset (same seed, 42, as before).
x_train, x_test, y_train, y_test = train_test_split(titanic_dataset,survived, test_size= 0.5, random_state = 42)
# Fitting (optimisation) of the model parameters.
# NOTE(review): this re-fits the `sgd` instance created in an earlier cell,
# reusing its hyper-parameters as-is.
sgd = sgd.fit(x_train, y_train)
accuracy_train = sklearn.metrics.accuracy_score(y_train, sgd.predict(x_train))
accuracy_train
0.7966360856269113
accuracy_test = sklearn.metrics.accuracy_score(y_test, sgd.predict(x_test))
accuracy_test
0.8
add_score("Linéaire 2",accuracy_train,accuracy_test).sort_values(by = "Test", ascending = False)
Modèle | Train | Test | Delta | |
---|---|---|---|---|
3 | Linéaire 2 | 0.796636 | 0.800000 | 0.003364 |
1 | Forêt Aléatoire | 0.896024 | 0.798473 | 0.097551 |
2 | Forêt Aléatoire 2 | 0.862385 | 0.798473 | 0.063912 |
0 | Linéaire | 0.711009 | 0.683969 | 0.027040 |
# A shallower forest (max_depth=5, 100 trees) trained on the enriched features.
clf = RandomForestClassifier(max_depth=5, n_estimators = 100)
clf = clf.fit(x_train, y_train)
accuracy_train = sklearn.metrics.accuracy_score(y_train, clf.predict(x_train))
accuracy_train
0.845565749235474
accuracy_test = sklearn.metrics.accuracy_score(y_test, clf.predict(x_test))
accuracy_test
0.816793893129771
add_score("Forêt Aléatoire 3",accuracy_train,accuracy_test).sort_values(by = "Test", ascending = False)
Modèle | Train | Test | Delta | |
---|---|---|---|---|
4 | Forêt Aléatoire 3 | 0.845566 | 0.816794 | 0.028772 |
3 | Linéaire 2 | 0.796636 | 0.800000 | 0.003364 |
1 | Forêt Aléatoire | 0.896024 | 0.798473 | 0.097551 |
2 | Forêt Aléatoire 2 | 0.862385 | 0.798473 | 0.063912 |
0 | Linéaire | 0.711009 | 0.683969 | 0.027040 |
from tqdm import tqdm_notebook as tqdm

# Random search over the linear model's hyper-parameters (10 000 draws).
best_model = None
max_accuracy = 0
max_accuracy_train = 0
for i in tqdm(range(10000)):
    # Sample one hyper-parameter configuration at random.
    PARAM = {"max_iter" : np.random.randint(2000,100000),\
        "penalty" : np.random.choice(["none", "l2", "l1", "elasticnet"]),\
        "alpha": np.random.uniform(0.00001,0.001),\
        "tol" : np.random.uniform(0,1e-3)}
    sgd = linear_model.SGDClassifier(**PARAM)
    sgd = sgd.fit(x_train, y_train)
    accuracy_train = sklearn.metrics.accuracy_score(y_train, sgd.predict(x_train))
    accuracy_test = sklearn.metrics.accuracy_score(y_test, sgd.predict(x_test))
    # Keep the configuration with the best TEST accuracy.
    # NOTE(review): selecting on the test set leaks it into model choice, so
    # the reported test accuracy is optimistic — a separate validation split
    # would be cleaner.
    if accuracy_test > max_accuracy:
        max_accuracy = accuracy_test
        max_accuracy_train = accuracy_train
        best_model = sgd

print(best_model)
print(max_accuracy_train, max_accuracy)
add_score("RS Linéaire",max_accuracy_train,max_accuracy).sort_values(by = "Test", ascending = False)
SGDClassifier(alpha=0.0006810136812994646, average=False, class_weight=None, early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=83154, n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5, random_state=None, shuffle=True, tol=0.0002960762746992756, validation_fraction=0.1, verbose=0, warm_start=False) 0.7981651376146789 0.8091603053435115
Modèle | Train | Test | Delta | |
---|---|---|---|---|
4 | Forêt Aléatoire 3 | 0.845566 | 0.816794 | 0.028772 |
5 | RS Linéaire | 0.798165 | 0.809160 | 0.010995 |
3 | Linéaire 2 | 0.796636 | 0.800000 | 0.003364 |
1 | Forêt Aléatoire | 0.896024 | 0.798473 | 0.097551 |
2 | Forêt Aléatoire 2 | 0.862385 | 0.798473 | 0.063912 |
0 | Linéaire | 0.711009 | 0.683969 | 0.027040 |
from tqdm import tqdm_notebook as tqdm

# Random search over the random-forest hyper-parameters (1 000 draws).
best_model = None
max_accuracy = 0
best_model_train_accuracy = 0
for i in tqdm(range(1000)):
    # Sample one configuration at random.
    PARAM = {"max_depth" : np.random.randint(4,10),\
        "n_estimators" : np.random.randint(10,200),\
        "min_samples_split": np.random.randint(2,10),\
        "min_samples_leaf": np.random.randint(1,10)}
    clf = RandomForestClassifier(**PARAM)
    clf = clf.fit(x_train, y_train)
    accuracy_train = sklearn.metrics.accuracy_score(y_train, clf.predict(x_train))
    accuracy_test = sklearn.metrics.accuracy_score(y_test, clf.predict(x_test))
    # Keep the forest with the best test accuracy (see the caveat on the
    # linear search: selecting on the test set makes this estimate optimistic).
    if accuracy_test > max_accuracy:
        max_accuracy = accuracy_test
        best_model_train_accuracy = accuracy_train
        best_model = clf

print(best_model)
print(best_model_train_accuracy, max_accuracy)
# BUG FIX: the original logged `max_accuracy_train` — a leftover variable from
# the LINEAR random search above — so the results table showed 0.798165 (the
# linear model's train accuracy) instead of the 0.828746 printed just above.
# Log this search's own train accuracy instead.
add_score("RS Forêt Aléatoire",best_model_train_accuracy,max_accuracy).sort_values(by = "Test", ascending = False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=5, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=8, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False) 0.8287461773700305 0.8229007633587786
Modèle | Train | Test | Delta | |
---|---|---|---|---|
6 | RS Forêt Aléatoire | 0.798165 | 0.822901 | 0.024736 |
4 | Forêt Aléatoire 3 | 0.845566 | 0.816794 | 0.028772 |
5 | RS Linéaire | 0.798165 | 0.809160 | 0.010995 |
3 | Linéaire 2 | 0.796636 | 0.800000 | 0.003364 |
1 | Forêt Aléatoire | 0.896024 | 0.798473 | 0.097551 |
2 | Forêt Aléatoire 2 | 0.862385 | 0.798473 | 0.063912 |
0 | Linéaire | 0.711009 | 0.683969 | 0.027040 |
Elys ALLESIARDO