Scikit-learn 机器学习库详解
1. 什么是 Scikit-learn
Scikit-learn(简称sklearn)是Python中最流行的机器学习库之一,建立在NumPy、SciPy和matplotlib之上。它提供了简单高效的数据挖掘和数据分析工具,适合各种机器学习任务。
核心特性
- 简单易用:一致的API设计,易于学习和使用
- 全面覆盖:包含分类、回归、聚类、降维等算法
- 高效实现:底层使用C/Cython优化,性能优秀
- 良好文档:详细的文档和丰富的示例
- 活跃社区:持续更新和维护
2. 安装和环境配置
基础安装
bash
# 使用pip安装
pip install scikit-learn
# 或使用conda安装
conda install scikit-learn
# 安装完整的科学计算环境
pip install numpy pandas matplotlib seaborn jupyter
验证安装
python
# Verify the installation by importing the stack and printing versions.
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
print(f"Scikit-learn版本: {sklearn.__version__}")
print(f"NumPy版本: {np.__version__}")
print(f"Pandas版本: {pd.__version__}")
3. 核心概念和工作流程
3.1 机器学习工作流程
python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# 1. Data preparation (X and y must be provided, e.g. by a loader).
# X, y = load_data()
# 2. Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# 3. Preprocessing — fit the scaler on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 4. Model training.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
# 5. Prediction and evaluation.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"准确率: {accuracy:.4f}")
3.2 估计器(Estimator)接口
python
# 所有sklearn算法都遵循相同的接口
class EstimatorInterface:
    """Illustrates the uniform API that all sklearn estimators follow.

    The original snippet lost its indentation during extraction; the
    class body is restored to valid Python here.
    """

    def fit(self, X, y=None):
        """Train the model."""
        pass

    def predict(self, X):
        """Make predictions."""
        pass

    def score(self, X, y):
        """Evaluate model performance."""
        pass
4. 数据预处理
4.1 特征缩放
python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
# Generate sample data with different per-column scales and offsets.
X = np.random.randn(100, 3) * [10, 5, 2] + [100, 50, 20]
# Standardization (mean 0, unit variance).
scaler_std = StandardScaler()
X_std = scaler_std.fit_transform(X)
# Min-max scaling (into the 0-1 range).
scaler_minmax = MinMaxScaler()
X_minmax = scaler_minmax.fit_transform(X)
# Robust scaling (uses the median and interquartile range).
scaler_robust = RobustScaler()
X_robust = scaler_robust.fit_transform(X)
print("原始数据统计:")
print(f"均值: {X.mean(axis=0)}")
print(f"标准差: {X.std(axis=0)}")
print("\n标准化后:")
print(f"均值: {X_std.mean(axis=0)}")
print(f"标准差: {X_std.std(axis=0)}")
4.2 特征编码
python
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
# Build a small example frame with two categorical columns and one numeric.
data = pd.DataFrame({
'color': ['red', 'blue', 'green', 'red', 'blue'],
'size': ['S', 'M', 'L', 'M', 'S'],
'price': [10, 20, 30, 15, 12]
})
# Label encoding (maps categories to integers).
label_encoder = LabelEncoder()
data['color_encoded'] = label_encoder.fit_transform(data['color'])
# One-hot encoding; drop='first' avoids the dummy-variable trap.
ct = ColumnTransformer([
('onehot', OneHotEncoder(drop='first'), ['color', 'size'])
], remainder='passthrough')
X_encoded = ct.fit_transform(data[['color', 'size', 'price']])
print("编码后的特征形状:", X_encoded.shape)
4.3 处理缺失值
python
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np
# Create a small matrix containing missing values.
X = np.array([[1, 2, 3],
[4, np.nan, 6],
[7, 8, np.nan],
[np.nan, 11, 12]])
# Simple imputation (mean; median/most_frequent are also available).
imputer_mean = SimpleImputer(strategy='mean')
X_mean = imputer_mean.fit_transform(X)
# KNN imputation (fills from the nearest neighbors' values).
imputer_knn = KNNImputer(n_neighbors=2)
X_knn = imputer_knn.fit_transform(X)
print("原始数据:")
print(X)
print("\n均值填充:")
print(X_mean)
print("\nKNN填充:")
print(X_knn)
5. 监督学习算法
5.1 分类算法
python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# Generate a synthetic 3-class classification dataset.
X, y = make_classification(n_samples=1000, n_features=20, n_classes=3,
n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Logistic regression.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
# Random forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# Support vector machine with an RBF kernel.
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
# Report results.
print("逻辑回归结果:")
print(classification_report(y_test, lr_pred))
print("\n随机森林结果:")
print(classification_report(y_test, rf_pred))
5.2 回归算法
python
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Generate a synthetic regression dataset.
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Ordinary least squares.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
# Ridge regression (L2 regularization).
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
# Lasso regression (L1 regularization).
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
# Random forest regressor.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
rf_pred = rf_reg.predict(X_test)
# Evaluate every model; the loop body was flattened in the original and is
# re-indented here so MSE/R² are computed per model.
models = [('Linear', lr_pred), ('Ridge', ridge_pred),
          ('Lasso', lasso_pred), ('Random Forest', rf_pred)]
for name, pred in models:
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name} - MSE: {mse:.4f}, R²: {r2:.4f}")
6. 无监督学习算法
6.1 聚类算法
python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Generate blob data with 4 well-separated clusters.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60,
random_state=42)
# K-Means clustering.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = kmeans.fit_predict(X)
# DBSCAN (density-based) clustering.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)
# Agglomerative (hierarchical) clustering.
agg_clustering = AgglomerativeClustering(n_clusters=4)
agg_labels = agg_clustering.fit_predict(X)
# Compare clusterings via the silhouette coefficient.
print(f"K-Means轮廓系数: {silhouette_score(X, kmeans_labels):.4f}")
print(f"DBSCAN轮廓系数: {silhouette_score(X, dbscan_labels):.4f}")
print(f"层次聚类轮廓系数: {silhouette_score(X, agg_labels):.4f}")
# Visualize the results in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
axes[0, 0].set_title('真实标签')
axes[0, 1].scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis')
axes[0, 1].set_title('K-Means')
axes[1, 0].scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis')
axes[1, 0].set_title('DBSCAN')
axes[1, 1].scatter(X[:, 0], X[:, 1], c=agg_labels, cmap='viridis')
axes[1, 1].set_title('层次聚类')
plt.tight_layout()
plt.show()
6.2 降维算法
python
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Load the handwritten-digits dataset (64 features per image).
digits = load_digits()
X, y = digits.data, digits.target
print(f"原始数据形状: {X.shape}")
# Linear dimensionality reduction with PCA.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Nonlinear embedding with t-SNE.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X[:1000])  # t-SNE is slow; use a subset
# Visualize both projections side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# PCA projection.
scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10')
axes[0].set_title('PCA降维结果')
axes[0].set_xlabel(f'PC1 (解释方差: {pca.explained_variance_ratio_[0]:.2f})')
axes[0].set_ylabel(f'PC2 (解释方差: {pca.explained_variance_ratio_[1]:.2f})')
# t-SNE projection.
scatter2 = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=y[:1000], cmap='tab10')
axes[1].set_title('t-SNE降维结果')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
plt.colorbar(scatter1, ax=axes[0])
plt.colorbar(scatter2, ax=axes[1])
plt.tight_layout()
plt.show()
print(f"PCA累计解释方差比: {pca.explained_variance_ratio_.sum():.4f}")
7. 模型选择和评估
7.1 交叉验证
python
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load the iris dataset.
iris = load_iris()
X, y = iris.data, iris.target
# Create the model.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Plain k-fold cross-validation.
cv_scores = cross_val_score(rf, X, y, cv=5)
print(f"5折交叉验证分数: {cv_scores}")
print(f"平均分数: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
# Stratified cross-validation (preserves class proportions per fold).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(rf, X, y, cv=skf)
print(f"分层交叉验证分数: {stratified_scores}")
print(f"平均分数: {stratified_scores.mean():.4f}")
7.2 网格搜索调参
python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Exhaustive grid search over SVC hyperparameters.
param_grid = {
'C': [0.1, 1, 10, 100],
'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
'kernel': ['rbf', 'poly']
}
svm = SVC()
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")
# Evaluate the refit best estimator on the held-out test set.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"测试集分数: {test_score:.4f}")
7.3 学习曲线
python
from sklearn.model_selection import learning_curve
import numpy as np
import matplotlib.pyplot as plt
def plot_learning_curve(estimator, X, y, title="Learning Curve"):
    """Plot training vs. validation score as a function of training-set size.

    Uses 5-fold cross-validation via sklearn's ``learning_curve`` and shows
    the mean score with a +/- one-standard-deviation band for both curves.
    The original body lost its indentation during extraction and is restored
    here. Relies on ``learning_curve``, ``np`` and ``plt`` imported above.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10)
    )
    # Aggregate per-fold scores into mean and spread.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='训练分数')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color='blue')
    plt.plot(train_sizes, val_mean, 'o-', color='red', label='验证分数')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color='red')
    plt.xlabel('训练样本数')
    plt.ylabel('分数')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()
# Draw the learning curve for a random forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
plot_learning_curve(rf, X, y, "随机森林学习曲线")
8. 集成学习
8.1 Bagging和Boosting
python
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
# Base estimator shared by the ensembles below (sklearn clones it per model).
base_classifier = DecisionTreeClassifier(random_state=42)
# Bagging: bootstrap-aggregated trees.
bagging = BaggingClassifier(base_classifier, n_estimators=100, random_state=42)
# Random forest (a specialized form of bagging).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# AdaBoost: sequentially reweighted boosting.
ada_boost = AdaBoostClassifier(base_classifier, n_estimators=100, random_state=42)
# Gradient boosting.
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
# Evaluate all models with 5-fold cross-validation; the loop body was
# flattened in the original and is re-indented here.
models = [
    ('Bagging', bagging),
    ('Random Forest', rf),
    ('AdaBoost', ada_boost),
    ('Gradient Boosting', gb)
]
for name, model in models:
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
8.2 投票分类器
python
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# Individual classifiers to combine.
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
nb = GaussianNB()
# Hard voting: majority vote over predicted labels.
hard_voting = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('nb', nb)],
    voting='hard'
)
# Soft voting: averages predicted class probabilities.
soft_voting = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('nb', nb)],
    voting='soft'
)
# Compare the individual models with both voting ensembles; the loop body
# was flattened in the original and is re-indented here.
models = [('Logistic Regression', lr), ('Random Forest', rf),
          ('Naive Bayes', nb), ('Hard Voting', hard_voting),
          ('Soft Voting', soft_voting)]
for name, model in models:
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
9. 管道(Pipeline)
9.1 基础管道
python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# Build a pipeline: scale -> reduce to 10 components -> classify.
pipe = Pipeline([
('scaler', StandardScaler()),
('pca', PCA(n_components=10)),
('classifier', LogisticRegression())
])
# Fit the whole pipeline on the training data.
pipe.fit(X_train, y_train)
# Predict.
y_pred = pipe.predict(X_test)
# Evaluate.
accuracy = pipe.score(X_test, y_test)
print(f"管道准确率: {accuracy:.4f}")
9.2 复合管道
python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
# Create a mixed numeric/categorical frame.
data = pd.DataFrame({
'numeric1': np.random.randn(1000),
'numeric2': np.random.randn(1000),
'category1': np.random.choice(['A', 'B', 'C'], 1000),
'category2': np.random.choice(['X', 'Y'], 1000)
})
# Preprocessor: scale numeric columns, one-hot encode categorical ones.
preprocessor = ColumnTransformer([
('num', StandardScaler(), ['numeric1', 'numeric2']),
('cat', OneHotEncoder(drop='first'), ['category1', 'category2'])
])
# Full pipeline: preprocessing followed by the classifier.
full_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
# Grid search over the classifier step (note the step__param naming).
param_grid = {
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [None, 10, 20]
}
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5)
# grid_search.fit(data, target) # 需要目标变量
10. 实际项目示例
10.1 完整的分类项目
python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# 假设我们有一个客户流失预测项目
def customer_churn_analysis():
    """End-to-end customer-churn classification example.

    Defines (but does not execute) the preprocessing, feature-engineering
    and train/evaluate steps; uncomment the data-loading line to run the
    full analysis. Returns None. The original body lost its indentation
    during extraction and is restored to valid Python here.
    """
    # 1. Data loading and exploration.
    # data = pd.read_csv('customer_data.csv')

    # 2. Data preprocessing.
    def preprocess_data(data):
        # Fill missing values with per-column medians. numeric_only=True is
        # required because median() over object columns raises in modern pandas.
        data = data.fillna(data.median(numeric_only=True))
        # Encode categorical variables (fit_transform refits per column).
        le = LabelEncoder()
        categorical_columns = data.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            if col != 'target':  # assume 'target' is the label column
                data[col] = le.fit_transform(data[col])
        return data

    # 3. Feature engineering.
    def feature_engineering(data):
        # Derive new features; assumes these columns exist in the input —
        # TODO confirm against the real schema (tenure == 0 would yield inf).
        data['total_charges_per_month'] = data['total_charges'] / data['tenure']
        data['services_count'] = (data[['phone_service', 'internet_service',
                                        'online_security']].sum(axis=1))
        return data

    # 4. Model training and evaluation.
    def train_and_evaluate(X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # Scale features: fit on the training split only to avoid leakage.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        # Train the classifier.
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train_scaled, y_train)
        # Predict and report.
        y_pred = rf.predict(X_test_scaled)
        print("分类报告:")
        print(classification_report(y_test, y_pred))
        # Confusion-matrix heatmap.
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        # Feature importances, top 10.
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        plt.figure(figsize=(10, 8))
        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
        plt.title('前10个重要特征')
        plt.show()
        return rf, scaler
# customer_churn_analysis() # 运行完整分析
11. 最佳实践
11.1 数据泄露防护
python
# Wrong: preprocessing before the split lets test statistics leak into training.
# X_scaled = StandardScaler().fit_transform(X)  # data leakage!
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)
# Right: split first, then fit the scaler on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit only on the training set
X_test_scaled = scaler.transform(X_test) # 在测试集上只transform
11.2 模型持久化
python
import joblib
# Persist the fitted model and scaler to disk.
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
# Load them back.
loaded_model = joblib.load('model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
# Predict with the loaded artifacts (apply the same scaling first).
new_data_scaled = loaded_scaler.transform(new_data)
predictions = loaded_model.predict(new_data_scaled)
11.3 性能监控
python
import time
from sklearn.metrics import accuracy_score
def benchmark_models(models, X_train, X_test, y_train, y_test):
    """Benchmark fit/predict wall time and test accuracy for each model.

    Parameters: ``models`` is a {name: estimator} mapping; the remaining
    arguments are the usual train/test splits. Returns a pandas DataFrame
    with one row per model (columns: model, accuracy, train_time,
    predict_time). The original body lost its indentation during extraction
    and is restored here; ``time.perf_counter`` replaces ``time.time``
    because it is monotonic and higher-resolution for interval timing.
    """
    results = []
    for name, model in models.items():
        # Training time.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time
        # Prediction time.
        start_time = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_time
        # Accuracy on the held-out split.
        accuracy = accuracy_score(y_test, y_pred)
        results.append({
            'model': name,
            'accuracy': accuracy,
            'train_time': train_time,
            'predict_time': predict_time
        })
    return pd.DataFrame(results)
# Usage example.
models = {
'Random Forest': RandomForestClassifier(),
'SVM': SVC(),
'Logistic Regression': LogisticRegression()
}
# benchmark_results = benchmark_models(models, X_train, X_test, y_train, y_test)
# print(benchmark_results)
12. 总结
Scikit-learn 是Python机器学习生态系统的核心库,提供了完整的机器学习工具链。从数据预处理到模型评估,sklearn都提供了简洁一致的API。
