import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# Top row of the figure: the first four images with their true labels.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# Flatten every image into a single feature row per sample.
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
half = n_samples // 2

# Create the classifier: a support vector machine.
classifier = svm.SVC(gamma=0.001)
# Train the model on the first half of the samples.
classifier.fit(data[:half], digits.target[:half])
# NOTE: a bare ``SVC(C=1.0, ..., gamma=0.001, kernel='rbf', ...)`` expression
# used to follow here. It was pasted notebook output (the repr of the fitted
# estimator), not code; only ``svm`` is imported, so it raised NameError.

# Predict the other half of the samples.
expected = digits.target[half:]
predicted = classifier.predict(data[half:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

# Bottom row of the figure: the first four held-out images with the
# predicted labels (subplot slots 5-8 of the 2x4 grid).
images_and_predictions = list(zip(digits.images[half:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        88
           1       0.99      0.97      0.98        91
           2       0.99      0.99      0.99        86
           3       0.98      0.87      0.92        91
           4       0.99      0.96      0.97        92
           5       0.95      0.97      0.96        91
           6       0.99      0.99      0.99        91
           7       0.96      0.99      0.97        89
           8       0.94      1.00      0.97        88
           9       0.93      0.98      0.95        92

   micro avg       0.97      0.97      0.97       899
   macro avg       0.97      0.97      0.97       899
weighted avg       0.97      0.97      0.97       899

Confusion matrix:
[[87  0  0  0  1  0  0  0  0  0]
 [ 0 88  1  0  0  0  0  0  1  1]
 [ 0  0 85  1  0  0  0  0  0  0]
 [ 0  0  0 79  0  3  0  4  5  0]
 [ 0  0  0  0 88  0  0  0  0  4]
 [ 0  0  0  0  0 88  1  0  0  2]
 [ 0  1  0  0  0  0 90  0  0  0]
 [ 0  0  0  0  0  1  0 88  0  0]
 [ 0  0  0  0  0  0  0  0 88  0]
 [ 0  0  0  1  0  1  0  0  0 90]]


%matplotlib inline
import pandas as pd
def opencsv(train_path='data/digit_train.csv', test_path='data/digit_test.csv'):  # open with pandas
    """Load the training and test CSV files with pandas.

    Args:
        train_path: CSV whose first column is the label and whose remaining
            columns are the features. Defaults to the original hard-coded path.
        test_path: CSV containing feature columns only. Defaults to the
            original hard-coded path.

    Returns:
        A ``(train_data, train_label, test_data)`` tuple of NumPy arrays:
        the training features (every column but the first), the training
        labels (the first column) and the test features (all columns).
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    train_values = train_frame.values
    train_data = train_values[:, 1:]   # everything after the label column
    train_label = train_values[:, 0]   # label column
    test_data = test_frame.values      # the test file has no label column

    print('Data Load Done!')
    return train_data, train_label, test_data
train_data, train_label, test_data = opencsv()

# train_data holds the 784 features of the training set, test_data holds the
# 784 features of the test set, and train_label holds the training-set labels.
# This makes the task a typical supervised-learning problem.
Data Load Done!
import matplotlib.pyplot as plt
from numpy import*
print(shape(train_data), shape(test_data))


def showPic(data):
    """Draw up to the first 70 rows of *data* as images in a 7x10 grid.

    Args:
        data: indexable collection of rows; each row must support
            ``.reshape(28, 28)`` (i.e. hold 784 values).
    """
    plt.figure(figsize=(12, 12))

    # Cap at the number of available rows so short inputs do not raise.
    for digit_num in range(min(70, len(data))):
        plt.subplot(7, 10, digit_num + 1)  # subplot indices are 1-based
        grid_data = data[digit_num].reshape(28, 28)  # reshape from 1d to 2d pixel array
        plt.imshow(grid_data, interpolation="none", cmap="afmhot")
(42000, 784) (28000, 784)
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0 188 255  94   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0 191 250 253  93   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
  123 248 253 167  10   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  80
  247 253 208  13   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  29 207
  253 235  77   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  54 209 253
  253  88   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  93 254 253 238
  170  17   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  23 210 254 253 159
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0  16 209 253 254 240  81
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0  27 253 253 254  13   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  20 206 254 254 198   7   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0 168 253 253 196   7   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0  20 203 253 248  76   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  22 188 253 245  93   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 103 253 253 191   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  89 240 253 195  25   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  15 220 253 253  80   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  94 253 253 253  94   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  89 251 253 250 131   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0 214 218  95   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]


从图像和数值上可以看出,像素值在 0-255 范围内变化,即每个特征都可以看作连续取值。想一想:这样的连续值对我们后期的特征选择是重要还是不重要的?我们还观察到,在 0 与大于 0 的交界处,像素值都不太高(有点像墨水在纸张上晕开了的感觉)。


def DataClean(data, epsilon):  # binarize data
    """Binarize *data* against a threshold.

    Every entry strictly greater than *epsilon* becomes 1.0 and every other
    entry becomes 0.0. The previous loop had lost its ``else`` branch, so it
    overwrote each 1 with 0 and always returned an all-zero array.

    Args:
        data: array-like of numbers (the original code required a 2-D array;
            any shape is accepted now).
        epsilon: threshold; values equal to it map to 0.

    Returns:
        A new float NumPy array with the same shape as *data*.
    """
    import numpy as np

    values = np.asarray(data)
    # Vectorized comparison replaces the element-by-element double loop.
    return (values > epsilon).astype(float)
from sklearn import svm
from datetime import datetime
from sklearn.model_selection import cross_val_score
# Baseline: 5-fold cross-validated accuracy of an RBF-kernel SVM on train_data.
start = datetime.now()
model = svm.SVC(kernel='rbf', C=10)
metric = cross_val_score(model, train_data, train_label, cv=5, scoring='accuracy').mean()
end = datetime.now()
# total_seconds() gives the whole elapsed time; timedelta.seconds drops the
# days and microseconds components.
print('CV use: %f' % (end - start).total_seconds())
# The format string previously had no placeholder, which raised
# "TypeError: not all arguments converted during string formatting".
print('Offline Accuracy is %f' % metric)



  • Principal Component Analysis ( PCA ) - Unsupervised, linear method

  • Linear Discriminant Analysis (LDA) - Supervised, linear method

  • t-distributed Stochastic Neighbour Embedding (t-SNE) - Nonlinear, probabilistic method


from sklearn.decomposition import PCA
def getncomponent(inputdata, threshold=0.10):
    """Suggest how many principal components to keep for *inputdata*.

    Fits a full PCA, expresses each component's explained variance as a
    fraction of the first component's explained variance, and returns the
    index of the first component whose fraction falls below *threshold*.

    NOTE(review): the ratio-to-first-component rule is reconstructed from the
    variable names and the empty loop left in the original; confirm it matches
    the intended criterion.

    Args:
        inputdata: samples-by-features data passed to ``PCA.fit``.
        threshold: cut-off for the variance fraction (originally the
            hard-coded 0.10).

    Returns:
        The recommended component index, or ``None`` when no component falls
        below the threshold.
    """
    pca = PCA()
    # explained_variance_ only exists after the estimator has been fitted.
    pca.fit(inputdata)
    EV_List = pca.explained_variance_

    leading_variance = EV_List[0]
    for j, variance in enumerate(EV_List):
        if variance / leading_variance < threshold:
            print('Recommend %d:' % j)
            return j
    return None


比如knn 这个太慢了我就不尝试了
比如Random Forest
比如Decision Tree

# Fit a 22-component whitened PCA on the training data and project the
# training data onto those components.
pca = PCA(n_components=22,whiten=True)
train_x = pca.fit_transform(train_data)
from datetime import datetime
def modeltest(train_x, train_label, model):
    """Report the 5-fold cross-validated accuracy of *model* and its run time.

    Args:
        train_x: feature matrix passed to ``cross_val_score``.
        train_label: labels matching the rows of *train_x*.
        model: unfitted scikit-learn estimator to evaluate.

    Returns:
        The mean accuracy over the 5 folds (also printed).
    """
    start = datetime.now()
    metric = cross_val_score(model, train_x, train_label, cv=5, scoring='accuracy').mean()
    end = datetime.now()
    # total_seconds() gives the whole elapsed time; timedelta.seconds drops
    # the days and microseconds components.
    print('CV use: %f' % (end - start).total_seconds())
    print('Offline Accuracy is %f' % (metric))
    return metric
from sklearn import svm
from sklearn.model_selection import cross_val_score
# Candidate 1: RBF-kernel support vector machine.
SVM_model = svm.SVC(kernel='rbf', C=10)
# NOTE(review): the recorded output below shows three "CV use / Offline
# Accuracy" pairs, so each candidate was evidently passed to modeltest();
# those calls were missing from the code and are restored here.
modeltest(train_x, train_label, SVM_model)

from sklearn.linear_model import LogisticRegression
# Candidate 2: logistic regression with default settings.
LR_model = LogisticRegression()
modeltest(train_x, train_label, LR_model)

from sklearn.ensemble import RandomForestClassifier
# Candidate 3: random forest with 100 trees.
RF_model = RandomForestClassifier(n_estimators=100)
modeltest(train_x, train_label, RF_model)
CV use: 49.000000
Offline Accuracy is 0.978452
CV use: 15.000000
Offline Accuracy is 0.865500
CV use: 73.000000
Offline Accuracy is 0.945143


# Final pipeline: 22-component whitened PCA + RBF SVM, then write the
# predictions to a CSV file.
pca = PCA(n_components=22, whiten=True)  # was misspelled "pea", leaving it unused
train_x = pca.fit_transform(train_data)
test_x = pca.transform(test_data)
resultname = 'PCA_SVM'

# SVM_model is only constructed earlier in the file, never fitted, so it has
# to be trained here before predict() can be called.
start = datetime.now()
SVM_model.fit(train_x, train_label)
end = datetime.now()
print('train time used:%f' % (end - start).total_seconds())

# Restart the clock so the prediction time does not include training time.
start = datetime.now()
test_y = SVM_model.predict(test_x)
end = datetime.now()
print('predict time used:%f' % (end - start).total_seconds())

# One row per test sample: a 1-based image id and the predicted label.
pred = [[index + 1, x] for index, x in enumerate(test_y)]
# Fixed the stray space in the file name and the "Imageld" typo in the header.
savetxt(resultname + '.csv', pred, delimiter=',', fmt='%d,%d', header='ImageId,Label', comments='')
