声明
本文代码均保存在
https://github.com/super-213/business_data_analysis
有需要的可以自行下载
查看数据
1 2 3 4
import os

# os.listdir() returns entries in arbitrary order, so sort them to get a
# reproducible sample order (plain lexicographic order, e.g. "10_..." sorts
# before "1_..."). Entries starting with "._" are skipped by the loading
# loops further down (presumably macOS sidecar files, not images); drop them
# here as well so that names[0] is always a real image file.
names = sorted(
    entry
    for entry in os.listdir('olivettifaces')
    if not entry.startswith("._")
)
names[0:5]
|
1 2 3
from PIL import Image

# Open the first listed file and display it in the system image viewer.
first_path = 'olivettifaces/' + names[0]
img0 = Image.open(first_path)
img0.show()
|
1 2 3 4 5 6
import numpy as np

# Convert to 8-bit grayscale ("L" mode), shrink to 32x32, then expose the
# pixels as a NumPy array.
img0 = img0.convert('L').resize((32, 32))
arr = np.array(img0)
arr
|
array([[186, 76, 73, …, 100, 103, 106],
[196, 85, 68, …, 85, 106, 103],
[193, 69, 79, …, 82, 99, 100],
…,
[196, 87, 193, …, 103, 66, 52],
[219, 179, 202, …, 150, 127, 109],
[244, 228, 230, …, 198, 202, 206]], dtype=uint8)
1 2
import pandas as pd

# Tabular preview of the pixel grid: one DataFrame row per image row.
pd.DataFrame(data=arr)
|
1 2 3
# Collapse the 2-D pixel grid into a single row; -1 lets NumPy infer the
# number of columns from the data.
row_shape = (1, -1)
arr = arr.reshape(row_shape)
print(arr)
|
[[186 76 73 … 198 202 206]]
1
# The same pixels as a flat Python list, the per-sample form appended to X
# in the loading loop.
pixel_list = arr.flatten().tolist()
print(pixel_list)
|
[186, 76, 73, 87, 89…]
1 2 3 4 5 6 7 8 9 10
# Build the feature matrix: one flattened 32x32 grayscale image per row.
X = []
for i in names:
    # Skip "._*" entries (presumably macOS metadata sidecars, not images).
    if i.startswith("._"):
        continue
    # Image.open() keeps the underlying file open until the image is closed.
    # The context manager releases each handle right away instead of leaving
    # hundreds of files open until garbage collection. convert() loads the
    # pixel data, so `img` stays usable after the file is closed.
    with Image.open('olivettifaces/' + i) as src:
        img = src.convert('L').resize((32, 32))
    arr = np.array(img)
    # flatten() already yields a 1-D vector, so the former reshape(1, -1)
    # step was redundant.
    X.append(arr.flatten().tolist())
    print(i)  # progress trace: one line per processed file
|
1 2 3 4
import pandas as pd

# One row per image, one column per pixel.
X = pd.DataFrame(X)
# Display the shape: the output reported for this cell is a
# (rows, columns) tuple, not the rendered DataFrame.
X.shape
|
(400, 1024)
1
# The text before the first underscore of a file name is parsed as the
# integer class label.
label_text = names[0].partition('_')[0]
print(int(label_text))
|
10
1 2 3 4 5 6 7 8
# Labels are derived from the file names alone, so the images do not need to
# be opened; the previous Image.open() call here only created unused,
# never-closed file handles.
# The "._" filter must match the one used when building X so that feature
# rows and labels stay aligned.
y = [int(i.split('_')[0]) for i in names if not i.startswith("._")]
print(y)
|
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11…]
数据划分
1 2
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split
|
PCA降维
1 2 3 4 5 6 7 8 9
from sklearn.decomposition import PCA

# Reduce the 1024 pixel features to 100 principal components.
# With this many features and n_components=100, scikit-learn's default
# svd_solver='auto' may pick the randomized solver, whose result depends on
# random_state. Pin the seed so the projection, and every score computed on
# it, is reproducible between runs.
pca = PCA(n_components=100, random_state=42)
pca.fit(X_train)  # fit on the training split only, to avoid test leakage

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print(X_train_pca.shape)
print(X_test_pca.shape)
|
(320, 100)
(80, 100)
KNN
1 2 3 4 5 6 7 8 9 10 11
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default hyper-parameters, trained on the
# PCA-reduced features. fit() returns the estimator itself.
knn = KNeighborsClassifier().fit(X_train_pca, y_train)

# Accuracy computed explicitly from the predictions ...
y_pred = knn.predict(X_test_pca)
print(accuracy_score(y_test, y_pred))

# ... and through the estimator's own score() helper.
score = knn.score(X_test_pca, y_test)
print(score)
|
0.9125
0.9125
随机森林
1 2 3 4 5 6 7 8 9 10
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Random forest on the raw (non-PCA) pixel features; seed fixed for
# repeatability. fit() returns the estimator itself.
rm = RandomForestClassifier(random_state=42).fit(X_train, y_train)

y_pred = rm.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
# accuracy_score is expected to be in scope from the KNN snippet's import.
print(accuracy_score(y_test, y_pred))
|
0.975
LGBM
1 2 3 4 5 6 7 8 9
from lightgbm import LGBMClassifier

# Gradient-boosted trees on the raw (non-PCA) pixel features.
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)

y_pred = lgbm.predict(X_test)

# Per-class metrics, confusion matrix, then overall accuracy.
# The metric helpers are expected to be in scope from earlier snippets.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)
print(accuracy_score(y_test, y_pred))
|
0.8875
贝叶斯
1 2 3 4 5 6 7 8 9
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the raw (non-PCA) pixel features.
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

# Per-class metrics, confusion matrix, then overall accuracy.
# The metric helpers are expected to be in scope from earlier snippets.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)
print(accuracy_score(y_test, y_pred))
|
0.925
SVM
1 2 3 4 5 6 7 8 9
from sklearn.svm import SVC

# Support vector classifier with default settings on the raw (non-PCA)
# pixel features.
svm = SVC(random_state=42)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

# Per-class metrics, confusion matrix, then overall accuracy.
# The metric helpers are expected to be in scope from earlier snippets.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)
print(accuracy_score(y_test, y_pred))
|
0.95
模型横向对比
| 模型 |
特点 |
是否降维 |
测试准确率 |
优点 |
缺点 |
| KNN |
基于邻近样本投票 |
✔ (PCA → 100维) |
0.9125 |
简单直观,效果较好 |
预测时计算量大,对高维数据敏感 |
| 随机森林 |
多棵决策树投票 |
✘ (原始 1024维) |
0.9750 |
精度高,抗过拟合,特征重要性可解释 |
训练速度较慢,模型体积较大 |
| LightGBM |
基于梯度提升的树模型 |
✘ (原始 1024维) |
0.8875 |
训练预测快,适合大规模数据 |
在小数据集上易过拟合或欠拟合 |
| 朴素贝叶斯 (GaussianNB) |
基于概率的生成模型 |
✘ (原始 1024维) |
0.9250 |
简单快速,鲁棒性好 |
特征独立性假设过强,可能丢失信息 |
| SVM (RBF kernel) |
最大化间隔的判别模型 |
✘ (原始 1024维) |
0.9500 |
对高维小样本表现好,分类边界清晰 |
参数调优敏感,计算复杂度高 |
综合结论
- 最优结果:随机森林 (97.5%) 在未降维的高维特征下表现最好。
- 次优结果:SVM (95%) 适合这种小样本高维特征的分类。
- 轻量方案:朴素贝叶斯 (92.5%) 简单快速,适合 baseline。
- KNN (91.25%) 降维后效果不错,但预测效率差。
- LightGBM (88.75%) 对小样本不够稳定,不如 RF/SVM。
- 推荐 随机森林 / SVM,它们在本次划分(单次划分、80 个测试样本)下准确率最高;若要判断哪种模型更稳定,建议再用交叉验证确认。
- 如果需要实时性和轻量,可以考虑 朴素贝叶斯 + PCA(注意:本文的朴素贝叶斯是在未降维的 1024 维特征上训练的,该组合尚未实验,效果需自行验证)。