Voting
Types of voting
Parameters of sklearn.ensemble.VotingClassifier
1. Required libraries and data
2. Cross-validation helper functions
3. Cross-validation benchmark
4. Fusing a set of classifiers
5. Building diversity
    5.1 Mixing several kinds of diversity
    5.2 Removing weak algorithms
    5.3 Trimming diversity
6. Weighting the classifiers
Types of voting
'''
Plurality voting: the class predicted by the most classifiers wins.
Absolute-majority voting: more than 50% of the classifiers must agree, otherwise the prediction is rejected. Provides a measure of voting confidence.

When classifiers output concrete class labels:
    Hard voting: the class predicted most often is the result.
When classifiers output probabilities (convertible to classes via a threshold or softmax):
    Soft voting: sum the predicted probabilities per class and take the class with the highest total. Provides a measure of voting confidence; may overfit.

Weighted voting
    Applied to hard voting: changes the number of votes each classifier casts.
    Applied to soft voting: changes each classifier's share, turning the probability sum into a weighted probability sum.
    Giving too much weight to the best-performing classifier invites overfitting.
'''
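A minimal sketch of hard, absolute-majority, and soft voting on made-up numbers (the probability array is illustrative, not from any trained model):
import numpy as np
# Hypothetical predict_proba outputs of 3 classifiers for one sample, 3 classes
proba = np.array([[0.6, 0.3, 0.1],   # classifier A
                  [0.4, 0.5, 0.1],   # classifier B
                  [0.2, 0.2, 0.6]])  # classifier C
# Hard voting: each classifier casts one vote for its argmax class
votes = np.bincount(proba.argmax(axis=1), minlength=3)  # [1, 1, 1]: a three-way tie
hard = votes.argmax()                                   # ties break toward the lowest class index
# Absolute-majority voting: abstain unless one class holds more than half the votes
majority = votes.argmax() if votes.max() > proba.shape[0] / 2 else None  # None here: abstain
# Soft voting: (weighted) average of the probabilities, then argmax
weights = [1.0, 1.0, 1.0]  # uniform; weighted voting changes these
soft = np.average(proba, axis=0, weights=weights).argmax()  # class 0, mean probability 0.4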
Parameters of sklearn.ensemble.VotingClassifier
'''
class sklearn.ensemble.VotingClassifier(estimators, *, voting='hard',
weights=None, n_jobs=None, flatten_transform=True, verbose=False)
estimators
    list of (name, estimator) tuples packing the base estimators
voting  voting scheme
    default 'hard': plurality (hard) voting
    'soft': soft voting; only accepts estimators that can output probabilities
weights  weights
    default None (uniform)
    a list packing one weight per estimator
flatten_transform  shape of the probabilities returned by transform
    only used with soft voting (voting='soft')
    True: output shape (n_samples, n_estimators * n_classes)
    False: output shape (n_estimators, n_samples, n_classes)
n_jobs  number of jobs to run in parallel
verbose  progress reporting while fitting
'''
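A minimal sketch of the two transform shapes; the dataset choice (load_iris) and variable names are assumptions for illustration only:
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
X_demo, y_demo = load_iris(return_X_y=True)  # 150 samples, 3 classes
vc = VotingClassifier(estimators=[('lr', LogisticRegression(max_iter=1000)),
                                  ('dt', DecisionTreeClassifier(random_state=0))],
                      voting='soft', flatten_transform=True).fit(X_demo, y_demo)
print(vc.transform(X_demo).shape)  # (150, 6): (n_samples, n_estimators * n_classes)
vc.set_params(flatten_transform=False).fit(X_demo, y_demo)
print(vc.transform(X_demo).shape)  # (2, 150, 3): (n_estimators, n_samples, n_classes)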
1. Required libraries and data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import LogisticRegression as LogiR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor
data = load_digits()
X = data.data
y = data.target
print('X.shape', X.shape)
print('y.shape', y.shape)
print('10 classes:', np.unique(y))
'''
X.shape (1797, 64)
y.shape (1797,)
10 classes: [0 1 2 3 4 5 6 7 8 9]
'''
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
2. Cross-validation helper functions
2.1 For individual estimators
def individual_estimators(estimators):
    '''
    Cross-validate each estimator in the fusion separately to gauge individual performance.
    '''
    for estimator in estimators:
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        result = cross_validate(estimator=estimator[1]
                                , X=Xtrain
                                , y=ytrain
                                , cv=cv  # use the 5-fold splitter defined above
                                , scoring='accuracy'
                                , n_jobs=-1
                                , return_train_score=True
                                , verbose=False
                                )
        test = estimator[1].fit(Xtrain, ytrain).score(Xtest, ytest)
        print(estimator[0]
              , '\n train_score:{}'.format(result['train_score'].mean())
              , '\n cv_mean:{}'.format(result['test_score'].mean())
              , '\n test_score:{}'.format(test)
              , '\n'
              )
2.2 For the fused model
def fusion_estimator(clf):
    '''
    Cross-validate the fused model to gauge its overall performance.
    '''
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    result = cross_validate(clf
                            , X=Xtrain, y=ytrain
                            , cv=cv
                            , scoring='accuracy'
                            , n_jobs=-1
                            , return_train_score=True
                            , verbose=False
                            )
    test = clf.fit(Xtrain, ytrain).score(Xtest, ytest)
    print(' train_score:{}'.format(result['train_score'].mean())
          , '\n cv_mean:{}'.format(result['test_score'].mean())
          , '\n test_score:{}'.format(test)
          )
3. Cross-validation benchmark
'''
The single algorithm with the highest score, further fine-tuned, serves as the benchmark.
'''
logi = LogiR(max_iter=3000, n_jobs=-1)
fusion_estimator(logi)
'''
train_score:1.0
cv_mean:0.9603343979868371
test_score:0.9722222222222222
'''
4. Fusing a set of classifiers
clf1 = LogiR(max_iter=3000, random_state=42, n_jobs=-1)
clf2 = RFC(n_estimators=100, random_state=42, n_jobs=-1)
clf3 = GBC(n_estimators=100, random_state=42)
estimators = [('Logistic Regression', clf1)
              , ('RandomForest', clf2)
              , ('GBDT', clf3)
              ]
clf = VotingClassifier(estimators=estimators
                       , voting='soft'
                       )
individual_estimators(estimators)
'''
Logistic Regression
train_score:1.0
cv_mean:0.956145954316686
test_score:0.9722222222222222
RandomForest
train_score:1.0
cv_mean:0.9735554587688734
test_score:0.9722222222222222
GBDT
train_score:1.0
cv_mean:0.951969608981804
test_score:0.9694444444444444
'''
fusion_estimator(clf)
'''
train_score:1.0
cv_mean:0.9784262485481998
test_score:0.9833333333333333
'''
Second set of classifiers: light tuning (not fine-grained) to limit overfitting.
clf2.fit(Xtrain, ytrain)
for i in clf2.estimators_[:10]:
    print(i.tree_.max_depth)
'''
12
14
14
14
14
15
16
13
15
13
'''
'''
Parameters for limiting overfitting:
clf1 logistic regression: decrease C
clf2 random forest: decrease max_depth
clf3 GBDT: feature subsampling with max_features='sqrt', so each tree is built on only sqrt(n_features) features
(GBDT trees are already shallow by default, max_depth=3, so lowering max_depth is not a useful lever for boosting)
'''
clf1 = LogiR(max_iter=3000, C=0.1, random_state=42, n_jobs=-1)
clf2 = RFC(n_estimators=100, max_depth=12, random_state=42, n_jobs=-1)
clf3 = GBC(n_estimators=100, max_features='sqrt', random_state=42)
estimators = [('Logistic Regression', clf1)
              , ('RandomForest', clf2)
              , ('GBDT', clf3)
              ]
clf = VotingClassifier(estimators=estimators
                       , voting='soft'
                       )
'''
Check whether each parameter choice is appropriate.
'''
individual_estimators(estimators)
'''
Logistic Regression
train_score:0.9998259355961705
cv_mean:0.9596205962059621
test_score:0.9722222222222222
RandomForest
train_score:1.0
cv_mean:0.9728585946573751
test_score:0.9722222222222222
GBDT
train_score:1.0
cv_mean:0.9686749903213319
test_score:0.9777777777777777
'''
fusion_estimator(clf)
'''
train_score:1.0
cv_mean:0.9770325203252032
test_score:0.9777777777777777
'''
5. Building diversity
'''
The more the estimators differ, the more independent they are from one another.
1. Training-data diversity
    Several rounds of feature engineering; train different models on different feature matrices. Works well. (See the sketch after this block.)
2. Sample diversity
    Same feature matrix, different sample subsets for training. The subsets must not be too small, or model quality degrades.
3. Feature diversity
    Same feature matrix, different feature subsets for training. The subsets must not be too small, or model quality degrades.
    e.g. clf3 GBDT with max_features='sqrt'
4. Random / training diversity
    Same algorithm with different random seeds (different features, samples, starting points), different loss functions, different impurity-decrease thresholds.
    Roughly equivalent to a bagging ensemble.
5. Algorithm diversity
    Add more model types, mixing ensembles, trees, probabilistic and linear models; the models must not be too weak.
'''
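A minimal sketch of training-data diversity, feeding one base model a PCA-compressed feature matrix via a pipeline (the pipeline setup and n_components are assumptions for illustration, not part of the original experiment):
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# One model sees the raw pixels, the other a compressed feature matrix
pca_knn = make_pipeline(StandardScaler()
                        , PCA(n_components=30, random_state=42)  # assumed component count
                        , KNNC(n_neighbors=10))
raw_rf = RFC(n_estimators=100, random_state=42, n_jobs=-1)
clf_div = VotingClassifier(estimators=[('pca_knn', pca_knn), ('raw_rf', raw_rf)]
                           , voting='soft')
fusion_estimator(clf_div)  # evaluate with the helper from section 2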
5.1 Mixing several kinds of diversity
clf1 = LogiR(max_iter=3000, C=0.1, random_state=1412, n_jobs=-1)
clf2 = RFC(n_estimators=100, max_features='sqrt', max_samples=0.9, random_state=1412, n_jobs=-1)
clf3 = GBC(n_estimators=100, max_features=16, random_state=1412)
clf4 = DTC(max_depth=8, random_state=1412)
clf5 = KNNC(n_neighbors=10, n_jobs=8)
clf6 = GaussianNB()
clf7 = RFC(n_estimators=100, max_features='sqrt', max_samples=0.9, random_state=1234, n_jobs=-1)
clf8 = GBC(n_estimators=100, max_features=16, random_state=1234)
estimators = [('Logistic Regression', clf1)
              , ('RandomForest', clf2)
              , ('GBDT', clf3)
              , ('Decision Tree', clf4)
              , ('KNN', clf5)
              , ('Bayes', clf6)
              , ('RandomForest2', clf7)
              , ('GBDT2', clf8)
              ]
clf = VotingClassifier(estimators=estimators
                       , voting='soft'
                       )
individual_estimators(estimators)
'''
The decision tree overfits; shrinking max_depth further drags down the CV score, so it cannot be improved.
KNN performs very well.
Bayes does not overfit badly but performs poorly; its low training score shows weak learning capacity.
'''
'''
Logistic Regression
train_score:0.9998259355961705
cv_mean:0.9596205962059621
test_score:0.9722222222222222
RandomForest
train_score:1.0
cv_mean:0.9763332365466513
test_score:0.9777777777777777
GBDT
train_score:1.0
cv_mean:0.968672570654278
test_score:0.975
Decision Tree
train_score:0.9363260301963902
cv_mean:0.8301901858304298
test_score:0.85
KNN
train_score:0.9824278200325425
cv_mean:0.9742523228803716
test_score:0.9833333333333333
Bayes
train_score:0.8590824535512922
cv_mean:0.8295489740611692
test_score:0.8472222222222222
RandomForest2
train_score:1.0
cv_mean:0.9700783972125435
test_score:0.9694444444444444
GBDT2
train_score:1.0
cv_mean:0.9672812620983352
test_score:0.9722222222222222
'''
fusion_estimator(clf)
'''
Similar to the first set of classifiers; the weak models need to be removed.
'''
'''
train_score:1.0
cv_mean:0.974934668989547
test_score:0.9805555555555555
'''
5.2 Removing weak algorithms
estimators = [('Logistic Regression', clf1)
              , ('RandomForest', clf2)
              , ('GBDT', clf3)
              , ('Decision Tree', clf4)
              , ('KNN', clf5)
              , ('RandomForest2', clf7)
              , ('GBDT2', clf8)
              ]
clf = VotingClassifier(estimators=estimators
                       , voting='soft'
                       )
fusion_estimator(clf)
'''
Performance improves.
'''
'''
train_score:1.0
cv_mean:0.9812040263259775
test_score:0.9833333333333333
'''
5.3 Trimming diversity
estimators = [('Logistic Regression', clf1)
              , ('RandomForest', clf2)
              , ('GBDT', clf3)
              , ('Decision Tree', clf4)
              , ('KNN', clf5)
              ]
clf = VotingClassifier(estimators=estimators
                       , voting='soft'
                       )
fusion_estimator(clf)
'''
About the same. If test-set performance is the priority, choose the fused model with the higher test_score.
'''
'''
train_score:1.0
cv_mean:0.9819008904374759
test_score:0.9805555555555555
'''
6. Weighting the classifiers
'''
Relies on repeated trial and error. (A sketch of automating the search closes this section.)
'''
Option 1: use each model's score from 5.1 (here the test_score values) as its weight; this risks overfitting.
estimators = [('Logistic Regression', clf1)
              , ('RandomForest', clf2)
              , ('GBDT', clf3)
              , ('Decision Tree', clf4)
              , ('KNN', clf5)]
clf_weighted = VotingClassifier(estimators=estimators
                                , voting='soft'
                                , weights=[0.9722, 0.9777, 0.9750, 0.85, 0.9833]
                                )
fusion_estimator(clf_weighted)
'''
No serious overfitting appears.
'''
'''
train_score:1.0
cv_mean:0.9825953348819201
test_score:0.9805555555555555
'''
Option 2: round the weights to lower precision, which may partially offset overfitting.
clf_weighted = VotingClassifier(estimators=estimators
                                , voting='soft'
                                , weights=[0.96, 0.96, 0.96, 0.85, 0.99]
                                )
fusion_estimator(clf_weighted)
'''
Precise versus rounded weights makes little difference here.
'''
'''
train_score:1.0
cv_mean:0.9825953348819201
test_score:0.9805555555555555
'''
Option 3: increase the weights of the strong algorithms and decrease those of the weak ones.
clf_weighted = VotingClassifier(estimators=estimators
                                , voting='soft'
                                , weights=[0.96, 0.96, 0.96, 0.85, 1.3]
                                )
fusion_estimator(clf_weighted)
'''
If this overfits, rather than raising the strong algorithms' weights further, lower the weak algorithm's weight (reduce the 0.85).
'''
'''
train_score:0.9998259355961705
cv_mean:0.9825929152148664
test_score:0.9805555555555555
'''
clf_weighted = VotingClassifier(estimators=estimators
                                , voting='soft'
                                , weights=[0.96, 0.96, 0.96, 0.3, 1.3]
                                )
fusion_estimator(clf_weighted)
'''
In this case, pushing the weak model's weight down while raising the strong one's works best.
'''
'''
train_score:1.0
cv_mean:0.9832897793263647
test_score:0.9833333333333333
'''
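The weight searches above are manual. A minimal sketch of automating them with cross_val_score over a small candidate grid (the grid values are illustrative, and the full product is expensive: 3^5 = 243 ensemble fits):
from itertools import product
from sklearn.model_selection import cross_val_score
best_score, best_w = 0.0, None
for w in product([0.3, 0.96, 1.3], repeat=len(estimators)):  # assumed candidate values
    clf_w = VotingClassifier(estimators=estimators, voting='soft', weights=list(w))
    score = cross_val_score(clf_w, Xtrain, ytrain, cv=5
                            , scoring='accuracy', n_jobs=-1).mean()
    if score > best_score:
        best_score, best_w = score, w
print(best_w, best_score)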