import mitosheet
import numpy as np
import pandas as pd
import plotly as py
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

cf.set_config_file(
    offline=True, 
    world_readable=True, 
    theme='white',        # 设置绘图风格
)

import warnings
warnings.filterwarnings("ignore")

import sklearn
import graphviz
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

from colorama import Fore
def color(text):
    return Fore.RED + text + Fore.RESET


file_path = './附件2(Attachment 2)2022-51MCM-Problem B.xlsx'
sheet1 = pd.read_excel(
    io=file_path, 
    index_col=None, 
    sheet_name='温度(temperature)', )
sheet2 = pd.read_excel(
    io=file_path, 
    index_col=None, 
    sheet_name='产品质量(quality of the products)', )
sheet3 = pd.read_excel(
    io=file_path, 
    index_col=None, 
    sheet_name='原矿参数(mineral parameter)', )
sheet4 = pd.read_excel(
    io=file_path, 
    index_col=None, 
    sheet_name='过程数据(process parameter)', )


# mitosheet.sheet(sheet1, sheet2, sheet3, sheet4, analysis_to_replay="id-tfyraljimv")


sheet1.iplot(x='时间 (Time)')


# todo 找到有效的温度数据
sheet1_time_string = sheet1.iloc[:, 0].astype('string')

cond1 = sheet1_time_string.apply(lambda x: x[14: 16]) == "50"
data_part1 = sheet1[cond1].iloc[:-2, :]

exp_date1 = [
    "2022-02-03 20:50:00", 
    "2022-02-26 13:50:00", 
    "2022-03-21 06:50:00", 
    "2022-04-04 10:50:00", "2022-04-04 15:50:00", 
    "2022-03-10 10:50:00", "2022-03-10 11:50:00", "2022-03-10 12:50:00", 
]
cond1 = sheet1_time_string.apply(lambda x: x in exp_date1)
data_part1 = data_part1[cond1.apply(lambda x: not x)]
data_part1.index = [i for i in range(len(data_part1))]
print(data_part1.shape)
# data_part1
# mitosheet.sheet(data_part1, analysis_to_replay="id-conydfcblv")

(1725, 3)


# todo 找到有效的产品质量数据
sheet2_time_string = sheet2.iloc[:, 0].astype('string')

exp_date2 = [
    "2022-02-20 23:50:00", "2022-02-21 00:50:00", "2022-02-21 01:50:00", 
    
    "2022-02-21 09:50:00", "2022-02-21 10:50:00", "2022-02-21 02:50:00", "2022-02-21 03:50:00", "2022-02-21 04:50:00", 
    "2022-02-21 05:50:00", "2022-02-21 06:50:00", "2022-02-21 07:50:00", "2022-02-21 08:50:00", 
    
    "2022-02-26 06:50:00", "2022-02-26 07:50:00", "2022-02-26 08:50:00", "2022-02-26 09:50:00", "2022-02-26 10:50:00",
    
    "2022-04-08 00:50:00", "2022-04-08 01:50:00", 
]

cond2 = sheet2_time_string.apply(lambda x: x in exp_date2)
data_part2_ = sheet2[cond2.apply(lambda x: not x)].iloc[2:, :]
data_part2_.index = [i for i in range(len(data_part2_))]
print(data_part2_.shape)
# data_part2_
# mitosheet.sheet(data_part2_, analysis_to_replay="id-etdktbfykz")

(1725, 5)


data_part2_


# todo 计算合格数量、合格率
cond10 = 77.78 < data_part2_.iloc[:, 1]
cond11 = data_part2_.iloc[:, 1] < 80.33
cond2 = data_part2_.iloc[:, 2] < 24.15
cond3 = data_part2_.iloc[:, 3] < 17.15
cond4 = data_part2_.iloc[:, 4] < 15.62
# print("合格率：", len(data_part2_[cond10][cond11][cond2][cond3][cond4]) / len(data_part2_))

# todo 找到产品是否合格
def is_qualified(x):
    return 77.78 < x[1] < 80.33 and x[2] < 24.15 and x[3] < 17.15 and x[4] < 15.62
data_part2 = pd.DataFrame(data_part2_.apply(is_qualified, axis=1))
data_part2.columns = ['是否合格']
# print(len(data_part2))
# print(data_part2.sum() / len(data_part2))

print(data_part2.shape)
print(data_part2.sum(), data_part2.sum() / len(data_part2))
# data_part2
# mitosheet.sheet(data_part2, analysis_to_replay="id-mkgwgrqoel")

(1725, 1)
是否合格    472
dtype: int64 是否合格    0.273623
dtype: float64


# todo 找到原矿参数数据 3
cnt = data_part1.iloc[:, 0].astype('string').apply(lambda x: x[5: 10])
time_cnt = []
for i in pd.DataFrame(cnt).groupby(by='时间 (Time)'):
    time_cnt.append(len(i[1]))
data_part3 = pd.DataFrame(np.repeat(sheet3.iloc[:-4, :].values, time_cnt, axis=0), columns=sheet3.columns)
print(data_part3.shape)
# data_part3
# mitosheet.sheet(data_part3, analysis_to_replay="id-rvruqimmlr")

(1725, 5)


cols = ['时间 (Time)', "过程数据3 (Process parameter 3)", "过程数据4 (Process parameter 4)"]
proc_data = pd.DataFrame(sheet4)
proc_data.iplot(x='时间 (Time)')
print("相关系数：")
proc_data.iloc[:, 1:].corr()

相关系数：


def norm(data):
    return (data - data.min()) / (data.max() - data.min())

data_part4 = sheet4.apply(lambda x: (x[3] + x[4]), axis=1)
data_part4 = pd.concat([sheet4.iloc[:, 0], data_part4], axis=1).rename(columns={0: "原矿质量"})
print(data_part4.shape)
# data_part4
# mitosheet.sheet(data_part4, analysis_to_replay="id-lntnexsmmk")

(619, 2)


exp_date4 = []
for i in exp_date1 + exp_date2:
    exp_date4.append(i[:-5] + "30")
# print(exp_date4)

sheet4_time_string = data_part4.iloc[:, 0].astype('string')
cond4 = sheet4_time_string.apply(lambda x: x[: -3] not in exp_date4)

data_part4_need = data_part4[cond4]
data_part4_need = data_part4_need.iloc[:-33, :]

for _ in range(5):
    data_part4_need.drop(index=np.random.randint(0, len(data_part4_need)), inplace=True)
data_part4_need.index = [i for i in range(len(data_part4_need))]

data_part4_need = pd.DataFrame(np.repeat(data_part4_need.values, 3, axis=0), columns=data_part4_need.columns)
print(data_part4_need.shape)
# data_part4_need
# mitosheet.sheet(data_part4_need, analysis_to_replay="id-owygulbcev")

(1725, 2)


# mitosheet.sheet(data_part1, data_part2, data_part3, data_part4_need, analysis_to_replay="id-gwrmcdoymx")


data_part4_need


X = pd.concat([data_part1.iloc[:, 1:], data_part3.iloc[:, 1:], data_part4_need.iloc[:, 1:]], axis=1)
Ys = data_part2


# mitosheet.sheet(X, Ys, analysis_to_replay="id-qrekqeyfua")


X.to_csv("quention3-X_data.csv")
Ys.to_csv("quention3-Y_data.csv")


cond = (pd.notna(X).iloc[:, 0] == True)
remain_index = X[cond].index


X = X[cond]
Y = Ys[cond].replace(to_replace=[True, False], value=[1, 0])
print(X.shape, Y.shape)

(1640, 7) (1640, 1)


# mitosheet.sheet(X, Y, analysis_to_replay="id-kzkchyyfcx")


# 04-08 04-09 处理即将预测的数据
data_to_predict = np.array(
    [[341.40, 665.04, 52.88, 91.27, 47.22, 22.26, ],     # 8
     [1010.32, 874.47, 54.44, 92.12, 48.85, 21.83, ],],  # 8
)
data_to_predict = np.repeat(data_to_predict, [8, 9], axis=0)
data_to_predict = np.concatenate([data_to_predict, np.array(data_part4.iloc[586:586 + 17:, 1:])], axis=1)
# mitosheet.sheet(pd.DataFrame(data_to_predict), analysis_to_replay="id-edacdozavl")


feature_name = list(X.columns)
feature_name

['系统I温度 (Temperature of system I)',
 '系统II温度 (Temperature of system II)',
 '原矿参数1 (Mineral parameter 1)',
 '原矿参数2 (Mineral parameter 2)',
 '原矿参数3 (Mineral parameter 3)',
 '原矿参数4 (Mineral parameter 4)',
 '原矿质量']

X


from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import LogisticRegressionCV as LRCV

test_size = 0.3

models_lr = [
    LR(
        penalty="l2", 
        C=1.0, 
        random_state=None, 
        solver="lbfgs", 
        max_iter=3000,
        multi_class='ovr', 
        verbose=0,
    ), 
    LRCV(
        penalty="l2", 
        random_state=None, 
        solver="lbfgs", 
        max_iter=3000,
        multi_class='ovr', 
        verbose=0,
    ), 
]

for model in models_lr:
    print("模型：", model)
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=test_size, random_state=10, shuffle=True)

    # todo 训练训练集
    model.fit(xtrain, ytrain)
    # todo 预测测试集
    yhat = model.predict(xtest)

    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhat))
    # auc
    print('AUC:', roc_auc_score(ytest, yhat))
    
    # todo 预测
    pred = model.predict(data_to_predict)
    print("预测结果：", pred[:8])
    print(color(f"准确率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"准确率：{pred[8:].sum() / len(pred[8:])}"),  )
    print('-'*100)

模型： LogisticRegression(max_iter=3000, multi_class='ovr')
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.73      0.97      0.83       355
           1       0.44      0.05      0.09       137

    accuracy                           0.72       492
   macro avg       0.58      0.51      0.46       492
weighted avg       0.65      0.72      0.63       492

AUC: 0.5128713889174463
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------
模型： LogisticRegressionCV(max_iter=3000, multi_class='ovr')
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.73      0.97      0.83       355
           1       0.40      0.04      0.08       137

    accuracy                           0.72       492
   macro avg       0.56      0.51      0.46       492
weighted avg       0.63      0.72      0.62       492

AUC: 0.5092217538809499
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------


from sklearn.tree import ExtraTreeClassifier as ETC, DecisionTreeClassifier as DTC

model_dt = [
    ETC(), 
    DTC(), 
]
for model in model_dt:
    print("模型：", model)
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=test_size, random_state=10, shuffle=True)

    # todo 训练训练集
    model.fit(xtrain, ytrain)
    # todo 预测测试集
    yhat = model.predict(xtest)

    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhat))
    # auc
    print('AUC:', roc_auc_score(ytest, yhat))
    
    # todo 预测
    pred = model.predict(data_to_predict)
    print("预测结果：", pred[:8])
    print(color(f"准确率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"准确率：{pred[8:].sum() / len(pred[8:])}"),  )
    print('-'*100)

模型： ExtraTreeClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       355
           1       0.55      0.53      0.54       137

    accuracy                           0.75       492
   macro avg       0.69      0.68      0.68       492
weighted avg       0.75      0.75      0.75       492

AUC: 0.6819163154107125
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 1 1 0 0 0 0]
准确率：0.2222222222222222
----------------------------------------------------------------------------------------------------
模型： DecisionTreeClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81       355
           1       0.51      0.53      0.52       137

    accuracy                           0.73       492
   macro avg       0.66      0.66      0.66       492
weighted avg       0.73      0.73      0.73       492

AUC: 0.6641821733319626
预测结果： [1 1 1 1 1 1 1 1]
准确率：1.0
预测结果： [0 0 0 1 1 1 1 0 0]
准确率：0.4444444444444444
----------------------------------------------------------------------------------------------------


from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC

model_rf = [
    ABC(), 
    BC(), 
    ETC(), 
    GBC(), 
    RFC(), 
]
for model in model_rf:
    print("模型：", model)
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=test_size, random_state=10, shuffle=True)

    # todo 训练训练集
    model.fit(xtrain, ytrain)
    # todo 预测测试集
    yhat = model.predict(xtest)

    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhat))
    # auc
    print('AUC:', roc_auc_score(ytest, yhat))
    
    # todo 预测
    pred = model.predict(data_to_predict)
    print("预测结果：", pred[:8])
    print(color(f"准确率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"准确率：{pred[8:].sum() / len(pred[8:])}"),  )
    print('-'*100)

模型： AdaBoostClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.76      0.90      0.83       355
           1       0.53      0.28      0.36       137

    accuracy                           0.73       492
   macro avg       0.65      0.59      0.60       492
weighted avg       0.70      0.73      0.70       492

AUC: 0.5907988074431992
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------
模型： BaggingClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       355
           1       0.64      0.44      0.52       137

    accuracy                           0.77       492
   macro avg       0.72      0.67      0.69       492
weighted avg       0.76      0.77      0.76       492

AUC: 0.671090778246119
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------
模型： ExtraTreesClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       355
           1       0.62      0.56      0.59       137

    accuracy                           0.78       492
   macro avg       0.73      0.71      0.72       492
weighted avg       0.78      0.78      0.78       492

AUC: 0.7134162640074021
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------
模型： GradientBoostingClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       355
           1       0.60      0.34      0.43       137

    accuracy                           0.75       492
   macro avg       0.69      0.62      0.64       492
weighted avg       0.73      0.75      0.73       492

AUC: 0.6242212398478463
预测结果： [1 1 1 0 0 0 0 0]
准确率：0.375
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------
模型： RandomForestClassifier()
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       355
           1       0.63      0.50      0.56       137

    accuracy                           0.78       492
   macro avg       0.73      0.70      0.71       492
weighted avg       0.77      0.78      0.77       492

AUC: 0.6954867893492342
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------


from xgboost import XGBClassifier as XGBC
from xgboost import XGBRFClassifier as XGBRFC

models_xgb = [
    XGBC(), 
    XGBRFC(), 
]
for model in models_xgb:
    print("模型：", model)
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=test_size, random_state=10, shuffle=True)

    # todo 训练训练集
    model.fit(xtrain.values, ytrain)
    # todo 预测测试集
    yhat = model.predict(xtest.values)

    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhat))

    # auc
    print('AUC:', roc_auc_score(ytest, yhat))
    
    # todo 预测
    pred = model.predict(data_to_predict)
    print("预测结果：", pred[:8])
    print(color(f"准确率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"准确率：{pred[8:].sum() / len(pred[8:])}"),  )
    print('-'*100)

模型： XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)
[18:35:41] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       355
           1       0.58      0.52      0.55       137

    accuracy                           0.76       492
   macro avg       0.70      0.69      0.69       492
weighted avg       0.75      0.76      0.76       492

AUC: 0.6858846509715225
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 1 0 1 0 0]
准确率：0.2222222222222222
----------------------------------------------------------------------------------------------------
模型： XGBRFClassifier(base_score=None, booster=None, colsample_bylevel=None,
                colsample_bytree=None, gamma=None, gpu_id=None,
                importance_type='gain', interaction_constraints=None,
                max_delta_step=None, max_depth=None, min_child_weight=None,
                missing=nan, monotone_constraints=None, n_estimators=100,
                n_jobs=None, num_parallel_tree=None,
                objective='binary:logistic', random_state=None, reg_alpha=None,
                scale_pos_weight=None, tree_method=None,
                validate_parameters=None, verbosity=None)
[18:35:41] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
主要分类指标的文本报告:
              precision    recall  f1-score   support

           0       0.78      0.94      0.85       355
           1       0.67      0.32      0.43       137

    accuracy                           0.77       492
   macro avg       0.72      0.63      0.64       492
weighted avg       0.75      0.77      0.74       492

AUC: 0.6295980261128816
预测结果： [0 0 0 0 0 0 0 0]
准确率：0.0
预测结果： [0 0 0 0 0 0 0 0 0]
准确率：0.0
----------------------------------------------------------------------------------------------------


from hmz.math_model.predict import BP

test_size = 0.3
hidden_num = [5]
lr=0.01
epoch=10
batch_size=64
activate='relu'
criterion=None
optimizer='adam'
normalization=True

def run_BP(X=X, Y=Y):
    xtrain, xtest, ytrain, ytest = train_test_split(
        np.array(X, dtype=float), 
        np.squeeze(np.array(Y, dtype=float)), 
        test_size=test_size, 
        random_state=1, 
        shuffle=True, 
    )
#     print(ytrain, type(ytrain), type(ytrain), ytrain.shape)
    bp = BP(
        X.shape[1], hidden_num, 2, 
        lr=lr,
        epoch=epoch,
        optimizer=optimizer,
    )
    bp.train(xtrain, ytrain)
    yhat = bp.predict(xtest).cpu().detach().numpy()
    
    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhat))
    # auc
    print('AUC:', roc_auc_score(ytest, yhat))
    
    # todo 预测
    pred = bp.predict(data_to_predict)
    print("预测结果：", pred[:8])
    print(color(f"合格率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"合格率：{pred[8:].sum() / len(pred[8:])}"),  )
    return bp
bp = run_BP()

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Linear-1                    [64, 5]              40
            Linear-2                    [64, 5]              40
              ReLU-3                    [64, 5]               0
              ReLU-4                    [64, 5]               0
            Linear-5                    [64, 2]              12
            Linear-6                    [64, 2]              12
================================================================
Total params: 104
Trainable params: 104
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.00
Estimated Total Size (MB): 0.01
----------------------------------------------------------------

epoch: 9, train acc: 0.72, train loss: 8.36, eval acc: 0.73, eval loss: 2.25: 100%|████| 10/10 [00:00<00:00, 19.79it/s]

主要分类指标的文本报告:
              precision    recall  f1-score   support

         0.0       0.76      0.96      0.85       358
         1.0       0.62      0.19      0.30       134

    accuracy                           0.75       492
   macro avg       0.69      0.57      0.57       492
weighted avg       0.72      0.75      0.70       492

AUC: 0.5746685566580505
预测结果： tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
合格率：0.0
预测结果： tensor([1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
合格率：1.0


X = pd.concat([data_part1.iloc[:, 1:], data_part3.iloc[:, 1:], data_part4_need.iloc[:, 1:]], axis=1)
Ys = data_part2_.iloc[:, 1:]


cond = (pd.notna(X).iloc[:, 0] == True)
remain_index = X[cond].index


X = X[cond]
Y = Ys[cond].replace(to_replace=[True, False], value=[1, 0])
print(X.shape, Y.shape)

(1640, 7) (1640, 4)


# todo 找到产品是否合格
def is_qualified2(x):
    return 77.78 < x[0] < 80.33 and x[1] < 24.15 and x[2] < 17.15 and x[3] < 15.62


# todo all sklearn models：所有 sklearn 中的（较好）回归模型

from copy import copy
from sklearn.linear_model import LinearRegression as LR
from sklearn.tree import ExtraTreeRegressor as ETC, DecisionTreeRegressor as DTC
from sklearn.ensemble import AdaBoostRegressor as ABC
from sklearn.ensemble import BaggingRegressor as BC
from sklearn.ensemble import ExtraTreesRegressor as ETC
from sklearn.ensemble import GradientBoostingRegressor as GBC
from sklearn.ensemble import RandomForestRegressor as RFC
from xgboost import XGBRegressor as XGBC
from xgboost import XGBRFRegressor as XGBRFC

models_name = [
    "线性回归",
    "决策树",
    "随机森林",
    "XGBoost",
]

models = [
    LR(), 
    ETC(), 
    RFC(), 
    XGBC(), 
#     XGBRFC(), 
]


for model in models:
    print("模型：", model)
    xx = np.array(X)  # (n, 7)
    yy = np.array(Y)  # (n, 4)
    xtrain, xtest, ytrain, ytest = train_test_split(
        xx, yy,
        test_size=0.3, 
        random_state=10, 
        shuffle=True,
    )
    
    yhats = []
    ypreds = []
    for i in range(yy.shape[1]):
        m = copy(model)
        m.fit(xtrain, ytrain[:, i])
        yhat = m.predict(xtest)
        yhats.append(list(yhat))
        ypred = m.predict(data_to_predict)
        ypreds.append(list(ypred))
        
    yhats = pd.DataFrame(yhats, index=["指标A", "指标B", "指标C", "指标D"]).T
    ytest = pd.DataFrame(ytest, columns=["指标A", "指标B", "指标C", "指标D"])
    
    yhats = np.squeeze(pd.DataFrame(yhats.apply(is_qualified2, axis=1)))
    ytest = np.squeeze(pd.DataFrame(ytest.apply(is_qualified2, axis=1)))
    
    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhats))
    # auc
    print('AUC:', roc_auc_score(ytest, yhats))
    print()
    
    # todo 预测结果
    ypreds = pd.DataFrame(ypreds, index=["指标A", "指标B", "指标C", "指标D"]).T
    ypreds = np.array(pd.DataFrame(ypreds.apply(is_qualified2, axis=1)))
    pred = ypreds
#     pred = np.reshape(ypreds, (-1, 1))
    print("预测结果：", pred[:8])
    print(color(f"合格率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"合格率：{pred[8:].sum() / len(pred[8:])}"),  )
    print('-'*100)

模型： LinearRegression()
主要分类指标的文本报告:
              precision    recall  f1-score   support

       False       0.77      0.62      0.69       355
        True       0.35      0.53      0.42       137

    accuracy                           0.59       492
   macro avg       0.56      0.57      0.55       492
weighted avg       0.66      0.59      0.61       492

AUC: 0.5748740618895858

预测结果： [[False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]]
合格率：0.0
预测结果： [[False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [False]]
合格率：0.5555555555555556
----------------------------------------------------------------------------------------------------
模型： ExtraTreesRegressor()
主要分类指标的文本报告:
              precision    recall  f1-score   support

       False       0.84      0.85      0.85       355
        True       0.60      0.59      0.60       137

    accuracy                           0.78       492
   macro avg       0.72      0.72      0.72       492
weighted avg       0.78      0.78      0.78       492

AUC: 0.7209725506322607

预测结果： [[ True]
 [ True]
 [ True]
 [False]
 [False]
 [False]
 [ True]
 [ True]]
合格率：0.625
预测结果： [[False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [False]]
合格率：0.5555555555555556
----------------------------------------------------------------------------------------------------
模型： RandomForestRegressor()
主要分类指标的文本报告:
              precision    recall  f1-score   support

       False       0.83      0.84      0.83       355
        True       0.57      0.54      0.55       137

    accuracy                           0.76       492
   macro avg       0.70      0.69      0.69       492
weighted avg       0.75      0.76      0.76       492

AUC: 0.6911997532641102

预测结果： [[ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]]
合格率：1.0
预测结果： [[False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]]
合格率：0.6666666666666666
----------------------------------------------------------------------------------------------------
模型： XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
主要分类指标的文本报告:
              precision    recall  f1-score   support

       False       0.85      0.83      0.84       355
        True       0.59      0.61      0.60       137

    accuracy                           0.77       492
   macro avg       0.72      0.72      0.72       492
weighted avg       0.78      0.77      0.77       492

AUC: 0.7234707515163977

预测结果： [[False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]]
合格率：0.0
预测结果： [[False]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [False]]
合格率：0.2222222222222222
----------------------------------------------------------------------------------------------------


# todo my bp nn model：自己写的 BP 神经网络
from hmz.math_model.predict import BP

test_size = 0.3
hidden_num = [10]
lr=0.01
epoch=10
batch_size=64
activate='relu'
criterion=None
optimizer='adam'
normalization=True

def run_BP(X=X, Y=Y):
    print("BP模型：")
    xx = np.array(X, dtype=float)  # (n, 7)
    yy = np.array(Y, dtype=float)  # (n, 4)
    xtrain, xtest, ytrain, ytest = train_test_split(
        xx, yy,
        test_size=0.3, 
        random_state=10, 
        shuffle=True,
    )
    
    yhats = []
    ypreds = []
    for i in range(yy.shape[1]):
        bp = BP(
            X.shape[1], hidden_num, 1, 
            lr=lr,
            epoch=epoch,
            optimizer=optimizer,
            activate='sigmoid',
        )
        
        bp.train(xtrain, ytrain[:, i])
        yhat = bp.predict(xtest).cpu().detach().numpy()
        yhats.append(list(yhat))
        ypred = bp.predict(data_to_predict).cpu().detach().numpy()
        ypreds.append(list(np.squeeze(ypred)))
#     print(ypreds)
    yhats = pd.DataFrame(yhats, index=["指标A", "指标B", "指标C", "指标D"]).T
    ytest = pd.DataFrame(ytest, columns=["指标A", "指标B", "指标C", "指标D"])
    
    yhats = np.squeeze(pd.DataFrame(yhats.apply(is_qualified2, axis=1))).astype(float)
    ytest = np.squeeze(pd.DataFrame(ytest.apply(is_qualified2, axis=1))).astype(float)
    print(ytest)
#     print(yhats, ytest)
    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhats))
    # auc
    print('AUC:', roc_auc_score(ytest, yhats))
    print()
    
    # todo 预测结果
    ypreds = pd.DataFrame(ypreds, index=["指标A", "指标B", "指标C", "指标D"]).T
    ypreds = np.array(pd.DataFrame(ypreds.apply(is_qualified2, axis=1)))
    pred = ypreds
#     pred = np.reshape(ypreds, (-1, 1))
    print("预测结果：", pred[:8])
    print(color(f"合格率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"合格率：{pred[8:].sum() / len(pred[8:])}"),  )
    print('-'*100)
    return bp
bp = run_BP()

BP模型：
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Linear-1                   [64, 10]              80
            Linear-2                   [64, 10]              80
           Sigmoid-3                   [64, 10]               0
           Sigmoid-4                   [64, 10]               0
            Linear-5                    [64, 1]              11
            Linear-6                    [64, 1]              11
================================================================
Total params: 182
Trainable params: 182
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 0.00
Estimated Total Size (MB): 0.02
----------------------------------------------------------------

epoch: 9, train loss: 62632.07, eval loss: 16166.99: 100%|█████████████████████████████| 10/10 [00:00<00:00, 22.90it/s]

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Linear-1                   [64, 10]              80
            Linear-2                   [64, 10]              80
           Sigmoid-3                   [64, 10]               0
           Sigmoid-4                   [64, 10]               0
            Linear-5                    [64, 1]              11
            Linear-6                    [64, 1]              11
================================================================
Total params: 182
Trainable params: 182
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 0.00
Estimated Total Size (MB): 0.02
----------------------------------------------------------------

epoch: 9, train loss: 2210.94, eval loss: 518.19: 100%|████████████████████████████████| 10/10 [00:00<00:00, 22.50it/s]

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Linear-1                   [64, 10]              80
            Linear-2                   [64, 10]              80
           Sigmoid-3                   [64, 10]               0
           Sigmoid-4                   [64, 10]               0
            Linear-5                    [64, 1]              11
            Linear-6                    [64, 1]              11
================================================================
Total params: 182
Trainable params: 182
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 0.00
Estimated Total Size (MB): 0.02
----------------------------------------------------------------

epoch: 9, train loss: 45.21, eval loss: 10.84: 100%|███████████████████████████████████| 10/10 [00:00<00:00, 22.87it/s]

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Linear-1                   [64, 10]              80
            Linear-2                   [64, 10]              80
           Sigmoid-3                   [64, 10]               0
           Sigmoid-4                   [64, 10]               0
            Linear-5                    [64, 1]              11
            Linear-6                    [64, 1]              11
================================================================
Total params: 182
Trainable params: 182
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 0.00
Estimated Total Size (MB): 0.02
----------------------------------------------------------------

epoch: 9, train loss: 317.33, eval loss: 82.73: 100%|██████████████████████████████████| 10/10 [00:00<00:00, 21.42it/s]

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
487    1.0
488    0.0
489    0.0
490    1.0
491    0.0
Name: 0, Length: 492, dtype: float64
主要分类指标的文本报告:
              precision    recall  f1-score   support

         0.0       0.72      1.00      0.84       355
         1.0       0.00      0.00      0.00       137

    accuracy                           0.72       492
   macro avg       0.36      0.50      0.42       492
weighted avg       0.52      0.72      0.60       492

AUC: 0.5

预测结果： [[False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]]
合格率：0.0
预测结果： [[False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]]
合格率：0.0
----------------------------------------------------------------------------------------------------


index_num = Y.shape[1]
index_name = ["指标A", "指标B", "指标C", "指标D"]
index_colors = ["red", "lightpink", "darkorange", "khaki", "green", "lightgreen", "blue", "lightblue"]

models_name = [
    "XGBoost",
]

models = [
    XGBC(), 
]
model_name = 'XGBoost'
datadata = []
width = 1000
height = 700

for model in models:
    print("模型：", model)
    xx = np.array(X)  # (n, 7)
    yy = np.array(Y)  # (n, 4)
    xtrain, xtest, ytrain, ytest = train_test_split(
        xx, yy,
        test_size=0.3, 
        random_state=10, 
        shuffle=True,
    )
    
    yhats = []
    ypreds = []
    for i in range(yy.shape[1]):
        m = copy(model)
        m.fit(xtrain, ytrain[:, i])
        yhat = m.predict(xtest)
        yhats.append(list(yhat))
        
        ypred = m.predict(data_to_predict)
        ypreds.append(list(ypred))
        
        # 画图
        fig_y = m.predict(xx)
        datadata.append(go.Scatter(
            x=data_part1.iloc[:, 0], y=yy[:, i], 
            name=index_name[i] + "-真实值", 
            line=dict(color=index_colors[i * 2 + 1], width=1)), 
        )
        datadata.append(go.Scatter(
            x=data_part1.iloc[:, 0], y=fig_y,
            name=index_name[i] + "-预测值", 
            line=dict(color=index_colors[i * 2], width=1)), 
        )
        
        # todo 画图：点差图
        cols = ["指标A", "指标B", "指标C", "指标D"][i]
        Yhat = pd.DataFrame(fig_y)
        Y.index = [i for i in range(len(yy[:, i]))]
        Y_data = pd.concat([pd.Series(yy[:, i]), pd.Series(fig_y)], axis=1)
        Y_data.columns = ["真实值", "预测值"]
        
        Y_data.figure(
            kind='spread', 
            color=[index_colors[i * 2 + 1], index_colors[i * 2]], 
            title='基于' +  model_name + '的' + cols + '预测模型', 
        ).write_image('./img/问题3-基于' +  model_name + '的' + cols + '预测模型.svg', width=width, height=height)
        
        
    yhats = pd.DataFrame(yhats, index=["指标A", "指标B", "指标C", "指标D"]).T
    ytest = pd.DataFrame(ytest, columns=["指标A", "指标B", "指标C", "指标D"])
    
    yhats = np.squeeze(pd.DataFrame(yhats.apply(is_qualified2, axis=1)))
    ytest = np.squeeze(pd.DataFrame(ytest.apply(is_qualified2, axis=1)))
    
    # 主要分类指标的文本报告
    print('主要分类指标的文本报告:')
    print(classification_report(ytest, yhats))
    # auc
    print('AUC:', roc_auc_score(ytest, yhats))
    print()

    # todo 预测结果
    ypreds = pd.DataFrame(ypreds, index=["指标A", "指标B", "指标C", "指标D"]).T
    ypreds = np.array(pd.DataFrame(ypreds.apply(is_qualified2, axis=1)))
    pred = ypreds
#     pred = np.reshape(ypreds, (-1, 1))
    print("预测结果：", pred[:8])
    print(color(f"合格率：{pred[:8].sum() / len(pred[:8])}"),  )
    print("预测结果：", pred[8:])
    print(color(f"合格率：{pred[8:].sum() / len(pred[8:])}"),  )
    print()
    
    # 画图
    
    fig = go.Figure(data=datadata, )
    
    annotations = []
    annotations.append(dict(
        x=0.5, y=-0.1,
        xref='paper', yref='paper', 
        xanchor='center', yanchor='top',
        text='时间',
        font=dict(size=16),
        showarrow=False, 
    ))
    fig.update_layout(
        title='基于' +  model_name + '的指标预测模型',
        annotations=annotations,
        template="plotly_white",
    )
    fig.write_image('./img/问题3-基于' + model_name + '的指标预测模型.svg', width=width, height=height)
    fig.show()

模型： XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
主要分类指标的文本报告:
              precision    recall  f1-score   support

       False       0.85      0.83      0.84       355
        True       0.59      0.61      0.60       137

    accuracy                           0.77       492
   macro avg       0.72      0.72      0.72       492
weighted avg       0.78      0.77      0.77       492

AUC: 0.7234707515163977

预测结果： [[False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]]
合格率：0.0
预测结果： [[False]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [False]]
合格率：0.2222222222222222

	时间 (Time)	指标A (index A)	指标B (index B)	指标C (index C)	指标D (index D)
0	2022-01-25 02:50:00	79.08	23.52	12.41	17.86
1	2022-01-25 03:50:00	79.29	22.94	11.72	17.86
2	2022-01-25 04:50:00	79.95	21.42	10.68	17.63
3	2022-01-25 05:50:00	80.20	21.20	10.16	16.92
4	2022-01-25 06:50:00	80.38	20.75	10.16	15.75
...	...	...	...	...	...
1720	2022-04-07 19:50:00	79.82	23.84	11.03	13.52
1721	2022-04-07 20:50:00	78.98	25.36	11.37	12.85
1722	2022-04-07 21:50:00	78.86	25.40	11.37	11.42
1723	2022-04-07 22:50:00	79.10	25.58	11.37	11.55
1724	2022-04-07 23:50:00	79.32	24.82	11.03	11.55

	过程数据1 (Process parameter 1)	过程数据2 (Process parameter 2)	过程数据3 (Process parameter 3)	过程数据4 (Process parameter 4)
过程数据1 (Process parameter 1)	1.000000	NaN	0.058849	-0.147755
过程数据2 (Process parameter 2)	NaN	NaN	NaN	NaN
过程数据3 (Process parameter 3)	0.058849	NaN	1.000000	-0.497288
过程数据4 (Process parameter 4)	-0.147755	NaN	-0.497288	1.000000

	时间 (Time)	原矿质量
0	2022-01-25 02:30:11	407.39
1	2022-01-25 02:30:11	407.39
2	2022-01-25 02:30:11	407.39
3	2022-01-25 05:30:13	406.89
4	2022-01-25 05:30:13	406.89
...	...	...
1720	2022-04-07 20:30:17	485.59
1721	2022-04-07 20:30:17	485.59
1722	2022-04-07 23:30:10	440.34
1723	2022-04-07 23:30:10	440.34
1724	2022-04-07 23:30:10	440.34

	系统I温度 (Temperature of system I)	系统II温度 (Temperature of system II)	原矿参数1 (Mineral parameter 1)	原矿参数2 (Mineral parameter 2)	原矿参数3 (Mineral parameter 3)	原矿参数4 (Mineral parameter 4)	原矿质量
0	1347.49	950.40	55.26	108.03	43.29	20.92	407.39
1	1274.43	938.20	55.26	108.03	43.29	20.92	407.39
2	1273.86	938.16	55.26	108.03	43.29	20.92	407.39
3	1273.51	937.49	55.26	108.03	43.29	20.92	406.89
4	1272.84	936.67	55.26	108.03	43.29	20.92	406.89
...	...	...	...	...	...	...	...
1720	437.71	540.70	54.4	105.14	49.03	20.82	485.59
1721	494.23	557.21	54.4	105.14	49.03	20.82	485.59
1722	495.47	557.68	54.4	105.14	49.03	20.82	440.34
1723	494.41	572.00	54.4	105.14	49.03	20.82	440.34
1724	495.03	571.61	54.4	105.14	49.03	20.82	440.34

问题3¶

准备数据¶

表1——温度(temperature)¶

表2——产品质量(quality of the products)¶

表3——原矿参数(mineral parameter)¶

表4——过程数据(process parameter)¶

预测是否合格-思路1¶

1. 逻辑回归¶

2. 决策树¶

3. 随机森林¶

4. XGBoost¶

5. BP¶

预测是否合格-思路2¶

准备数据¶

预测指标：线性回归、决策树、随机森林、XGBoost、BP¶

首先查看哪个模型效果最好¶

最终筛选出 XGBoost 效果最好，四个指标都是¶