Training different scikit-learn classifiers on multiple CPUs for each iteration

I have a script that randomly generates a dataset and trains several different classifiers to compare them against one another (it is very similar to http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html):

from itertools import product 

import numpy as np 

from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
from sklearn.naive_bayes import GaussianNB, MultinomialNB 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 

from sklearn.datasets import make_classification 
from sklearn.preprocessing import StandardScaler 
from sklearn.cross_validation import train_test_split # sklearn.model_selection in newer scikit-learn versions 

names = ["Linear SVM", "Decision Tree", 
    "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis", 
    "Quadratic Discriminant Analysis"] 

def griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove): 
    classifiers = [ 
     SVC(kernel="linear", C=0.025), 
     DecisionTreeClassifier(max_depth=5), 
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
     AdaBoostClassifier(), GaussianNB(), 
     LinearDiscriminantAnalysis(), 
     QuadraticDiscriminantAnalysis()] 

    classifiers2 = [ 
     SVC(kernel="linear", C=0.025), 
     DecisionTreeClassifier(max_depth=5), 
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
     AdaBoostClassifier(), GaussianNB(), 
     LinearDiscriminantAnalysis(), 
     QuadraticDiscriminantAnalysis()] 

    X, y = make_classification(n_samples=num_samples, n_features=num_feats, n_redundant=0, n_informative=2, 
          random_state=1, n_clusters_per_class=1) 
    X = StandardScaler().fit_transform(X) 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) 

    for name, clf, clf2 in zip(names, classifiers, classifiers2): 
     clf.fit(X_train, y_train) 
     score = clf.score(X_test, y_test) 
     # Remove 40% of the features. 
     clf2.fit(X_train[:,:-num_feats_to_remove], y_train) 
     score2 = clf2.score(X_test[:,:-num_feats_to_remove], y_test) 
     yield (num_samples, num_feats, num_feats_to_remove, name, score, score2) 

And to run it:

_samples = [100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000] 
_feats = [10, 20, 50, 100, 200, 500, 10000] 
_feats_to_rm = [5, 10, 25, 50, 100, 250] 
for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm): 
    if num_feats <= num_feats_to_remove: 
     continue 
    for i in griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove): 
     print (i) 

The script outputs something like:

(100, 10, 5, 'Linear SVM', 1.0, 0.40000000000000002) 
(100, 10, 5, 'Decision Tree', 1.0, 0.65000000000000002) 
(100, 10, 5, 'Random Forest', 1.0, 0.90000000000000002) 
(100, 10, 5, 'AdaBoost', 1.0, 0.65000000000000002) 
(100, 10, 5, 'Naive Bayes', 1.0, 0.75) 
(100, 10, 5, 'Linear Discriminant Analysis', 1.0, 0.40000000000000002) 
(100, 10, 5, 'Quadratic Discriminant Analysis', 1.0, 0.84999999999999998) 
(100, 20, 5, 'Linear SVM', 1.0, 1.0) 
(100, 20, 5, 'Decision Tree', 0.94999999999999996, 0.94999999999999996) 
(100, 20, 5, 'Random Forest', 0.80000000000000004, 0.75) 
(100, 20, 5, 'AdaBoost', 1.0, 0.94999999999999996) 
(100, 20, 5, 'Naive Bayes', 1.0, 1.0) 
(100, 20, 5, 'Linear Discriminant Analysis', 1.0, 1.0) 
(100, 20, 5, 'Quadratic Discriminant Analysis', 0.84999999999999998, 0.94999999999999996) 
(100, 20, 10, 'Linear SVM', 0.94999999999999996, 0.65000000000000002) 
(100, 20, 10, 'Decision Tree', 0.94999999999999996, 0.59999999999999998) 
(100, 20, 10, 'Random Forest', 0.75, 0.69999999999999996) 
(100, 20, 10, 'AdaBoost', 0.94999999999999996, 0.69999999999999996) 
(100, 20, 10, 'Naive Bayes', 0.94999999999999996, 0.75) 

but clf.fit() runs single-threaded as it stands.

Assuming that I have enough threads to run all of the classifiers for each iteration, how would I be able to train the classifiers on different threads for every iteration of for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm)?

And if I am limited to 4 or 8 threads but need to train more than 4 or 8 classifiers for each iteration, how is that done?

Answer

This is less an answer and more a sketch of an answer to your first question,

How would I be able to train the classifiers using different threads for every iteration of for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm)

I assume that by this you mean that for every iteration of for name, clf, clf2 in zip(names, classifiers, classifiers2): you want clf and clf2 trained on different processors.

Here is some working code as a starting point (it is poorly implemented, but the general idea is there):

from itertools import product 

import numpy as np 
import multiprocessing 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
from sklearn.naive_bayes import GaussianNB, MultinomialNB 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 

from sklearn.datasets import make_classification 
from sklearn.preprocessing import StandardScaler 
from sklearn.cross_validation import train_test_split # sklearn.model_selection in newer scikit-learn versions 

names = ["Linear SVM", "Decision Tree", 
    "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis", 
    "Quadratic Discriminant Analysis"] 

# def mp_handler(): 
#  p = multiprocessing.Pool(8) 
#  p.map(mp_worker, data) 

def mp_worker((name, clf, X_train, y_train, X_test, y_test, num_features_to_remove)): 
    # Python 2 tuple parameter unpacking: the single tuple argument passed by 
    # Pool.map is unpacked directly in the signature. 
    if num_features_to_remove is False: 
     clf.fit(X_train, y_train) 
     return ('score1', clf.score(X_test, y_test)) 

    clf.fit(X_train[:,:-num_features_to_remove], y_train) 
    return ('score2', clf.score(X_test[:,:-num_features_to_remove], y_test)) 

def griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove): 
    classifiers = [ 
     SVC(kernel="linear", C=0.025), 
     DecisionTreeClassifier(max_depth=5), 
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
     AdaBoostClassifier(), GaussianNB(), 
     LinearDiscriminantAnalysis(), 
     QuadraticDiscriminantAnalysis()] 

    classifiers2 = [ 
     SVC(kernel="linear", C=0.025), 
     DecisionTreeClassifier(max_depth=5), 
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
     AdaBoostClassifier(), GaussianNB(), 
     LinearDiscriminantAnalysis(), 
     QuadraticDiscriminantAnalysis()] 

    X, y = make_classification(n_samples=num_samples, n_features=num_feats, n_redundant=0, n_informative=2, 
          random_state=1, n_clusters_per_class=1) 
    X = StandardScaler().fit_transform(X) 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) 



    for name, clf, clf2 in zip(names, classifiers, classifiers2): 

     p = multiprocessing.Pool(2) # set to 2 to use two processors: one processor per classifier 
     # The integer argument passed to Pool equals the number of SETS of classifiers you have 
     data = (name, clf, X_train, y_train, X_test, y_test, False), (name, clf2, X_train, y_train, X_test, y_test, num_feats_to_remove) 
     res = p.map(mp_worker, data) # this splits the two classification tasks across two separate processes 
     p.close() # release the pool's worker processes before the next iteration 
     for i,j in res: #parse the results 
      if i == 'score1': 
       score1 = j 
      else: 
       score2 = j 

     yield (num_samples, num_feats, num_feats_to_remove, name, score1, score2) 

if __name__ == '__main__': 


    _samples = [100, 200] 
    _feats = [10, 20] 
    _feats_to_rm = [5, 10] 
    for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm): 
     if num_feats <= num_feats_to_remove: 
      continue 
     for i in griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove): 
      print (i) 

If I have misunderstood your question, the general principle in the code above can still be adapted to your needs. In the code above I draw on the accepted answer here.
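
As for your second question (being limited to 4 or 8 threads while needing to train more than 4 or 8 classifiers per iteration): multiprocessing.Pool already handles that case, because it queues the submitted tasks and hands them to a fixed number of worker processes as workers become free. A minimal sketch, assuming a hypothetical fit_and_score helper and four workers for the seven classifiers:

import multiprocessing 

from sklearn.datasets import make_classification 
from sklearn.preprocessing import StandardScaler 
from sklearn.cross_validation import train_test_split # sklearn.model_selection in newer versions 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 

def fit_and_score(args): 
    # Hypothetical helper: train one classifier and return its test score. 
    name, clf, X_train, y_train, X_test, y_test = args 
    clf.fit(X_train, y_train) 
    return (name, clf.score(X_test, y_test)) 

if __name__ == '__main__': 
    X, y = make_classification(n_samples=1000, n_features=20, n_redundant=0, 
           n_informative=2, random_state=1, n_clusters_per_class=1) 
    X = StandardScaler().fit_transform(X) 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) 

    names = ["Linear SVM", "Decision Tree", "Random Forest", "AdaBoost", 
       "Naive Bayes", "Linear Discriminant Analysis", 
       "Quadratic Discriminant Analysis"] 
    classifiers = [ 
     SVC(kernel="linear", C=0.025), 
     DecisionTreeClassifier(max_depth=5), 
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
     AdaBoostClassifier(), GaussianNB(), 
     LinearDiscriminantAnalysis(), 
     QuadraticDiscriminantAnalysis()] 

    # Four worker processes for seven tasks: the pool runs four at a time 
    # and starts each remaining task as soon as a worker becomes free. 
    p = multiprocessing.Pool(4) 
    tasks = [(name, clf, X_train, y_train, X_test, y_test) 
      for name, clf in zip(names, classifiers)] 
    for name, score in p.map(fit_and_score, tasks): 
     print((name, score)) 
    p.close() 
    p.join() 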


What is the double parenthesis in the mp_worker function? – cgl
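
For reference, the double parentheses are Python 2's tuple parameter unpacking, which was removed in Python 3 (PEP 3113): mp_worker receives one argument, the tuple built in data, and the inner parentheses unpack it into named variables in the signature itself. A Python 3-compatible sketch of the same worker unpacks inside the body instead:

def mp_worker(args): 
    # Python 3: 'def f((a, b)):' is a syntax error, so take the single 
    # tuple argument and unpack it explicitly on the first line. 
    name, clf, X_train, y_train, X_test, y_test, num_features_to_remove = args 
    if num_features_to_remove is False: 
     clf.fit(X_train, y_train) 
     return ('score1', clf.score(X_test, y_test)) 
    clf.fit(X_train[:,:-num_features_to_remove], y_train) 
    return ('score2', clf.score(X_test[:,:-num_features_to_remove], y_test)) 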