Reference: A Primer about Machine Learning in Catalysis – A Tutorial with Code, Palkovits, 2020, ChemCatChem (Wiley Online Library).

1 Importing Libraries and Data

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
pd.set_option('display.max_columns', 5)
url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-019-08325-8/MediaObjects/41467_2019_8325_MOESM3_ESM.xls'
raw_data = pd.read_excel(url, sheet_name='OCM_Dataset_corrected')  # reading .xls files requires the xlrd engine to be installed
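A quick sanity check that the download worked (an added step, not in the original tutorial):

print(raw_data.shape)                  # (rows, columns) of the OCM dataset
print(raw_data.columns[:10].tolist())  # first few column names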

2 Reading the Data

cation1 = raw_data.pivot(columns='Cation 1', values='Cation 1 mol%')
cation2 = raw_data.pivot(columns='Cation 2', values='Cation 2 mol%')
cation3 = raw_data.pivot(columns='Cation 3', values='Cation 3 mol%')
cation4 = raw_data.pivot(columns='Cation 4', values='Cation 4 mol%')
anion1 = raw_data.pivot(columns='Anion 1', values='Anion 1 mol%')
anion2 = raw_data.pivot(columns='Anion 2', values='Anion 2 mol%')
support1 = raw_data.pivot(columns='Support 1', values='Support 1 mol%')
support2 = raw_data.pivot(columns='Support 2', values='Support 2 mol%')

3 Data Processing

pivot_lists = [cation1, cation2, cation3, cation4, anion1, anion2, support1, support2]
concat_pivot_lists = pd.concat(pivot_lists, axis=1, sort=True)
composition = concat_pivot_lists.groupby(level=0, axis=1).sum()/100  # sum duplicate element columns, convert mol% to mole fraction
pub_nr = raw_data.iloc[:,1]           # publication number column
reaction_data = raw_data.iloc[:,19:]  # reaction-related columns
clean_list = [pub_nr, composition, reaction_data]
data_cleaned = pd.concat(clean_list, axis=1, sort=True)
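To make the pivot/concat/groupby pipeline above concrete, here is a toy example with made-up values (illustrative only, not from the dataset):

toy1 = pd.DataFrame({'Cation 1': ['Mn', 'Na'], 'Cation 1 mol%': [5.0, 2.0]})
toy2 = pd.DataFrame({'Cation 2': ['Na', 'W'], 'Cation 2 mol%': [1.0, 3.0]})
p1 = toy1.pivot(columns='Cation 1', values='Cation 1 mol%')  # one column per element, NaN elsewhere
p2 = toy2.pivot(columns='Cation 2', values='Cation 2 mol%')
merged = pd.concat([p1, p2], axis=1, sort=True)              # columns: Mn, Na, Na, W
print(merged.groupby(level=0, axis=1).sum() / 100)           # duplicate element columns (Na) are summed; NaN counts as 0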
print(composition.mean(axis=0).sort_values(ascending=False).head(18).keys())  # the 18 most abundant elements on average
count = 1
for i in composition.mean(axis=0).sort_values(ascending=False).head(9).keys():
    plt.subplot(3, 3, count)
    plt.scatter(data_cleaned[i], data_cleaned['S(C2), %'])
    plt.title(i)
    plt.xlabel('mole fraction')
    plt.ylabel('S(C2), %')
    count += 1
plt.tight_layout()

4 K-Means Clustering

## Goal of the clustering: find the most frequently used compositions and group them
k_elbow = []
for i in range(1, 21):
    kmeans = KMeans(n_clusters=i).fit(composition)
    score = kmeans.score(composition)  # negative within-cluster sum of squares
    k_elbow.append(score)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Score')
plt.plot([7, 7], [0, np.max(np.gradient(k_elbow))])  # vertical marker at the chosen k = 7
plt.plot(range(1, 21), np.gradient(k_elbow))
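KMeans also exposes the within-cluster sum of squares directly as inertia_ (score() returns its negative), which gives the more common form of the elbow plot; a minimal sketch:

inertias = [KMeans(n_clusters=k).fit(composition).inertia_ for k in range(1, 21)]
plt.plot(range(1, 21), inertias, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')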
## import seaborn as sns
## com1 = composition.iloc[:,0:2]
## kmeans = KMeans(n_clusters=3).fit(com1)
## sns.scatterplot(data=com1, x="Ag", y="Al",hue=kmeans.labels_)
## plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], 
##             marker="X", c="r", s=80, label="centroids")
## plt.show()
cluster = KMeans(n_clusters=7).fit(composition)
cluster_pred = cluster.predict(composition)  # identical to cluster.labels_ for the training data; labels each data point with its cluster
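How the samples distribute over the seven clusters can be checked in one line (an added check, not in the original):

print(np.bincount(cluster_pred))  # number of samples per cluster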

5 PCA Analysis

pca = PCA(n_components=2)
pca_result = pca.fit_transform(composition)  # project the compositions onto the first two principal components
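How much of the compositional variance the two components retain can be read off explained_variance_ratio_ (an added check):

print(pca.explained_variance_ratio_)        # variance fraction per component
print(pca.explained_variance_ratio_.sum())  # total variance kept in the 2D projection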

6 t-SNE Analysis

tsne = TSNE(n_components=2, random_state=42, init='pca')  # PCA initialization makes the embedding more reproducible
tsne_results = tsne.fit_transform(composition)
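After fitting, tsne.kl_divergence_ reports the final Kullback-Leibler divergence of the embedding, where lower is better (an added check):

print(tsne.kl_divergence_)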
fig, axs = plt.subplots(1, 2)
plot1 = axs[0].scatter(tsne_results[:, 0], tsne_results[:, 1], c=cluster_pred, cmap=plt.cm.get_cmap('viridis', 7))
axs[0].set_title('t-SNE')
plot2 = axs[1].scatter(pca_result[:, 0], pca_result[:, 1], c=cluster_pred, cmap=plt.cm.get_cmap('viridis', 7))
axs[1].set_title('PCA')
fig.colorbar(plot2, ax=axs)
count = 0
fig, axs = plt.subplots(3, 3)
element = composition.mean(axis=0).sort_values(ascending=False).head(9).keys()
for ax in axs.flat:
    ax.scatter(data_cleaned[element[count]], data_cleaned['S(C2), %'], c=cluster_pred, cmap=plt.cm.get_cmap('viridis', 7))
    ax.set_title(element[count])
    count += 1

7 Support Vector Regression (SVR)

from sklearn.model_selection import train_test_split
x = composition.fillna(0)
y = data_cleaned.iloc[:,-2].fillna(0)/100  # target: the second-to-last column, rescaled from % to a fraction
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
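As a sanity baseline (an addition, not in the original), a regressor that always predicts the training mean should be beaten by any useful model; its R² on held-out data is near zero by construction:

from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean').fit(x_train, y_train)
print('Baseline Test Score:', baseline.score(x_test, y_test))  # R² of the mean predictor, ~0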
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
hyper_params_svr = {'gamma': np.random.normal(12, 2, 10000), 'C': np.random.normal(1, 0.2, 10000)}  # the search samples 10 settings from these 10,000 candidates
svr_tune = SVR(kernel='rbf')
g_search_svr = RandomizedSearchCV(svr_tune, hyper_params_svr, cv=5, n_jobs=-1, random_state=42)  # n_jobs=-1 uses all CPU cores; n_iter defaults to 10
g_search_svr.fit(x_train, y_train)
print('Best Estimator:', g_search_svr.best_estimator_)
best_C = g_search_svr.best_estimator_.C
best_gamma = g_search_svr.best_estimator_.gamma
svr_rbf = SVR(kernel='rbf', C=best_C, gamma=best_gamma)  # final model with the tuned hyperparameters
y_svr = svr_rbf.fit(x_train, y_train)
predict_svr_train = y_svr.predict(x_train)
predict_svr_test = y_svr.predict(x_test)

print('SVR Train Score:', svr_rbf.score(x_train, y_train))  # score() returns R², the coefficient of determination
print('SVR Test Score:', svr_rbf.score(x_test, y_test))
plt.scatter(y_test, predict_svr_test)  # points close to the diagonal y = x indicate good predictions
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('SVR Test')
plt.show()

8 Random Forest (RDF) Regression

from sklearn.ensemble import RandomForestRegressor
hyper_params_rdf = {'n_estimators': np.arange(1, 25, 1)}  # 10 of these 24 values are sampled
rdf_tune = RandomForestRegressor()
g_search_rdf = RandomizedSearchCV(rdf_tune, hyper_params_rdf, cv=5, n_jobs=-1, random_state=42)  # 5-fold cross-validation
g_search_rdf.fit(x_train, y_train)
print('Best Estimator:', g_search_rdf.best_estimator_)
estimate_rdf = g_search_rdf.best_estimator_.n_estimators  # tuned number of trees
regressor = RandomForestRegressor(n_estimators=estimate_rdf, random_state=42)  # build the model
regressor.fit(x_train, y_train)  # train the model
y_pred_train = regressor.predict(x_train)
y_pred_test = regressor.predict(x_test)
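Random forests also provide out-of-bag (OOB) validation for free, evaluating each tree on the bootstrap samples it never saw; a sketch (not part of the original, and with very few trees some samples may lack OOB predictions and trigger a warning):

rf_oob = RandomForestRegressor(n_estimators=estimate_rdf, oob_score=True, random_state=42).fit(x_train, y_train)
print('OOB R2:', rf_oob.oob_score_)  # out-of-bag estimate of the R² score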

print('RDF Train Score:', regressor.score(x_train, y_train))  # score() is the R² score
print('RDF Test Score:', regressor.score(x_test, y_test))
plt.scatter(y_test, y_pred_test)  # points close to the diagonal y = x indicate good predictions
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('RDF Test')
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

mse_rdf_train = mean_squared_error(y_train, y_pred_train)  # MSE = (1/n) * sum((y_i - y_pred_i)^2)
mse_rdf_test = mean_squared_error(y_test,y_pred_test)

mse_svr_train = mean_squared_error(y_train,predict_svr_train)
mse_svr_test = mean_squared_error(y_test,predict_svr_test)

r2_rdf_train = r2_score(y_train,y_pred_train)
r2_rdf_test = r2_score(y_test,y_pred_test)

r2_svr_train = r2_score(y_train,predict_svr_train)
r2_svr_test = r2_score(y_test,predict_svr_test)

9 Comparing RDF and SVR Scores

plt.subplot(121)
plt.bar(np.arange(4), [mse_rdf_test, mse_svr_test, mse_rdf_train, mse_svr_train])
plt.xticks(np.arange(4), ['RDF Test', 'SVR Test', 'RDF Train', 'SVR Train'])
plt.ylabel('MSE')

plt.subplot(122)
plt.bar(np.arange(4), [r2_rdf_test, r2_svr_test, r2_rdf_train, r2_svr_train])
plt.xticks(np.arange(4), ['RDF Test', 'SVR Test', 'RDF Train', 'SVR Train'])
plt.ylabel('R2')
plt.show()  # start a fresh figure before the CV-score plots; otherwise the next subplot(121) draws into the same axes

svr_mean_cv_score = g_search_svr.cv_results_['mean_test_score']  # mean 5-fold CV score of each sampled parameter setting
rdf_mean_cv_score = g_search_rdf.cv_results_['mean_test_score']
plt.subplot(121)
plt.bar(np.arange(10), svr_mean_cv_score)
plt.title('SVR Mean CV Score')
plt.subplot(122)
plt.bar(np.arange(10), rdf_mean_cv_score)
plt.title('RDF Mean CV Score')
from sklearn.feature_selection import SelectFromModel

select_feature = SelectFromModel(RandomForestRegressor(n_estimators=estimate_rdf))  # build the selector
select_feature.fit(x_train, y_train)
selected_feature = x_train.columns[(select_feature.get_support())]  # features whose importance exceeds the mean importance (the default threshold)
print(selected_feature)

importances = regressor.feature_importances_
indices = np.argsort(importances)[::-1][:10]  # sort descending, return indices, keep the top 10
plt.title('Feature Importance')
plt.bar(np.arange(10), importances[indices])
plt.xticks(np.arange(10), x_train.columns[indices], rotation=90)
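Impurity-based importances can be biased toward features with many distinct values; permutation importance on the test set is a common complement (an added sketch, not from the original):

from sklearn.inspection import permutation_importance
perm = permutation_importance(regressor, x_test, y_test, n_repeats=10, random_state=42)
top = np.argsort(perm.importances_mean)[::-1][:10]  # 10 features whose shuffling hurts the score most
print(x_train.columns[top])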

10 Deep Learning Classification and Regression

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
tf.keras.backend.clear_session()
model = keras.Sequential([
    layers.Dense(73, activation='relu',
                 input_shape=(x_train.shape[1],)),  # input: one neuron per feature (68 features here), connected to 73 neurons
    layers.Dropout(0.2),  # randomly drop 20% of the inputs and scale the rest by 1/(1-0.2) = 1.25 to counter overfitting
    layers.Dense(73, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(73, activation='relu'),  # three hidden layers with 73 neurons each
    layers.Dropout(0.2),
    layers.Dense(1)
])
model.summary()  # inspect the model architecture
model.compile(optimizer='rmsprop', loss='mae')  # RMSProp optimizer, which can escape saddle points; loss is the mean absolute error
model.fit(x_train, y_train, epochs=500, validation_split=0.1, verbose=0)  # train for 500 epochs; 10% of the training set is held out for validation; verbose=0 silences per-epoch output
plt.plot(model.history.epoch, np.array(model.history.history['loss']), label='Train')   # training-loss curve
plt.plot(model.history.epoch, np.array(model.history.history['val_loss']), label='Val') # validation-loss curve
plt.legend()
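If the validation curve rises while the training loss keeps falling, the network is overfitting; instead of a fixed 500 epochs, the fit call above could use Keras early stopping (a sketch, not in the original):

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
model.fit(x_train, y_train, epochs=500, validation_split=0.1, verbose=0, callbacks=[early_stop])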
predict_nn_test = model.predict(x_test).flatten()   # flatten the (n, 1) output matrix to a vector
predict_nn_train = model.predict(x_train).flatten()
print('NN Train Score:', r2_score(y_train, predict_nn_train))
print('NN Test Score:', r2_score(y_test, predict_nn_test))
plt.scatter(y_test, predict_nn_test)  # points close to the diagonal y = x indicate good predictions
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('NN Test')