
Scikit-learn 机器学习库详解

1. 什么是 Scikit-learn

Scikit-learn(简称sklearn)是Python中最流行的机器学习库之一,建立在NumPy、SciPy和matplotlib之上。它提供了简单高效的数据挖掘和数据分析工具,适合各种机器学习任务。

核心特性

  • 简单易用:一致的API设计,易于学习和使用
  • 全面覆盖:包含分类、回归、聚类、降维等算法
  • 高效实现:底层使用C/Cython优化,性能优秀
  • 良好文档:详细的文档和丰富的示例
  • 活跃社区:持续更新和维护

2. 安装和环境配置

基础安装

bash
# Install with pip
pip install scikit-learn

# Or install with conda
conda install scikit-learn

# Install the full scientific-computing stack
pip install numpy pandas matplotlib seaborn jupyter

验证安装

python
# Import the core libraries and print their versions to confirm the install.
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print(f"Scikit-learn版本: {sklearn.__version__}")
print(f"NumPy版本: {np.__version__}")
print(f"Pandas版本: {pd.__version__}")

3. 核心概念和工作流程

3.1 机器学习工作流程

python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Data preparation
# X, y = load_data()

# 2. Split the data (hold out 20% for final evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Preprocessing: fit the scaler on the training set only, then apply
#    the same learned transform to the test set (prevents data leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Model training
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# 5. Prediction and evaluation
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"准确率: {accuracy:.4f}")

3.2 估计器(Estimator)接口

python
# Every scikit-learn algorithm follows this common estimator interface.
class EstimatorInterface:
    """Illustrates the method contract shared by all sklearn estimators."""

    def fit(self, X, y=None):
        """Learn model parameters from the training data."""

    def predict(self, X):
        """Produce predictions for new samples."""

    def score(self, X, y):
        """Report the model's performance on the given data."""

4. 数据预处理

4.1 特征缩放

python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np

# Generate sample data: 3 features with different scales and offsets
X = np.random.randn(100, 3) * [10, 5, 2] + [100, 50, 20]

# Standardization (zero mean, unit variance)
scaler_std = StandardScaler()
X_std = scaler_std.fit_transform(X)

# Min-max scaling (maps each feature into the 0-1 range)
scaler_minmax = MinMaxScaler()
X_minmax = scaler_minmax.fit_transform(X)

# Robust scaling (uses median and interquartile range; resists outliers)
scaler_robust = RobustScaler()
X_robust = scaler_robust.fit_transform(X)

print("原始数据统计:")
print(f"均值: {X.mean(axis=0)}")
print(f"标准差: {X.std(axis=0)}")
print("\n标准化后:")
print(f"均值: {X_std.mean(axis=0)}")
print(f"标准差: {X_std.std(axis=0)}")

4.2 特征编码

python
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Build a small mixed-type example frame
data = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue'],
    'size': ['S', 'M', 'L', 'M', 'S'],
    'price': [10, 20, 30, 15, 12]
})

# Label encoding.  NOTE(review): sklearn documents LabelEncoder for *target*
# labels; for feature columns OrdinalEncoder is the recommended tool.
label_encoder = LabelEncoder()
data['color_encoded'] = label_encoder.fit_transform(data['color'])

# One-hot encoding; drop='first' drops one dummy per feature to avoid
# perfect collinearity, remainder='passthrough' keeps 'price' unchanged
ct = ColumnTransformer([
    ('onehot', OneHotEncoder(drop='first'), ['color', 'size'])
], remainder='passthrough')

X_encoded = ct.fit_transform(data[['color', 'size', 'price']])
print("编码后的特征形状:", X_encoded.shape)

4.3 处理缺失值

python
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

# Build a small matrix with missing entries
X = np.array([[1, 2, 3],
              [4, np.nan, 6],
              [7, 8, np.nan],
              [np.nan, 11, 12]])

# Simple imputation (strategy may be 'mean', 'median' or 'most_frequent')
imputer_mean = SimpleImputer(strategy='mean')
X_mean = imputer_mean.fit_transform(X)

# KNN imputation: fill each gap from the 2 most similar rows
imputer_knn = KNNImputer(n_neighbors=2)
X_knn = imputer_knn.fit_transform(X)

print("原始数据:")
print(X)
print("\n均值填充:")
print(X_mean)
print("\nKNN填充:")
print(X_knn)

5. 监督学习算法

5.1 分类算法

python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Generate a synthetic 3-class dataset (20 features, 10 of them informative)
X, y = make_classification(n_samples=1000, n_features=20, n_classes=3, 
                          n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Support vector machine (RBF kernel)
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

# Report results (NOTE: svm_pred is computed above but not reported below)
print("逻辑回归结果:")
print(classification_report(y_test, lr_pred))
print("\n随机森林结果:")
print(classification_report(y_test, rf_pred))

5.2 回归算法

python
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Generate synthetic regression data
# (train_test_split was imported in the classification example above)
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least-squares linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Ridge regression (L2 regularization)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# Lasso regression (L1 regularization)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)

# Random forest regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
rf_pred = rf_reg.predict(X_test)

# Compare MSE and R² on the held-out test set
models = [('Linear', lr_pred), ('Ridge', ridge_pred), 
          ('Lasso', lasso_pred), ('Random Forest', rf_pred)]

for name, pred in models:
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name} - MSE: {mse:.4f}, R²: {r2:.4f}")

6. 无监督学习算法

6.1 聚类算法

python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Generate 4 well-separated Gaussian blobs to cluster
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, 
                       random_state=42)

# K-Means clustering.  n_init is pinned explicitly: its default changed to
# 'auto' in scikit-learn 1.4, and 1.2/1.3 emit a FutureWarning without it;
# n_init=10 reproduces the historical default on every version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# DBSCAN: density-based; discovers the cluster count itself (-1 marks noise)
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

# Agglomerative (hierarchical) clustering
agg_clustering = AgglomerativeClustering(n_clusters=4)
agg_labels = agg_clustering.fit_predict(X)

# Evaluate with the silhouette coefficient (higher is better).
# silhouette_score raises ValueError with fewer than 2 distinct labels,
# which DBSCAN can produce (e.g. everything flagged as noise) — guard it.
print(f"K-Means轮廓系数: {silhouette_score(X, kmeans_labels):.4f}")
if len(set(dbscan_labels)) > 1:
    print(f"DBSCAN轮廓系数: {silhouette_score(X, dbscan_labels):.4f}")
else:
    print("DBSCAN仅找到一个簇,无法计算轮廓系数")
print(f"层次聚类轮廓系数: {silhouette_score(X, agg_labels):.4f}")

# Visualize the ground truth and all three clusterings side by side
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
axes[0, 0].set_title('真实标签')
axes[0, 1].scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis')
axes[0, 1].set_title('K-Means')
axes[1, 0].scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis')
axes[1, 0].set_title('DBSCAN')
axes[1, 1].scatter(X[:, 0], X[:, 1], c=agg_labels, cmap='viridis')
axes[1, 1].set_title('层次聚类')
plt.tight_layout()
plt.show()

6.2 降维算法

python
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load the handwritten-digits dataset (8x8 images flattened to 64 features)
digits = load_digits()
X, y = digits.data, digits.target

print(f"原始数据形状: {X.shape}")

# PCA: linear projection onto the top 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# t-SNE: nonlinear 2-D embedding
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X[:1000])  # t-SNE is slow; embed only a subset

# Visualize both embeddings side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# PCA result, with explained-variance ratios in the axis labels
scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10')
axes[0].set_title('PCA降维结果')
axes[0].set_xlabel(f'PC1 (解释方差: {pca.explained_variance_ratio_[0]:.2f})')
axes[0].set_ylabel(f'PC2 (解释方差: {pca.explained_variance_ratio_[1]:.2f})')

# t-SNE result (labels sliced to match the 1000-sample subset)
scatter2 = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=y[:1000], cmap='tab10')
axes[1].set_title('t-SNE降维结果')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')

plt.colorbar(scatter1, ax=axes[0])
plt.colorbar(scatter2, ax=axes[1])
plt.tight_layout()
plt.show()

print(f"PCA累计解释方差比: {pca.explained_variance_ratio_.sum():.4f}")

7. 模型选择和评估

7.1 交叉验证

python
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Build the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Plain 5-fold cross-validation
cv_scores = cross_val_score(rf, X, y, cv=5)
print(f"5折交叉验证分数: {cv_scores}")
print(f"平均分数: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Stratified CV: each fold preserves the class proportions of the full set
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(rf, X, y, cv=skf)
print(f"分层交叉验证分数: {stratified_scores}")
print(f"平均分数: {stratified_scores.mean():.4f}")

7.2 网格搜索调参

python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer

# Load the data (train_test_split was imported in an earlier example)
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Exhaustive grid search over C, gamma and kernel
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

svm = SVC()
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

# Evaluate the refit best estimator on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"测试集分数: {test_score:.4f}")

7.3 学习曲线

python
from sklearn.model_selection import learning_curve
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(estimator, X, y, title="Learning Curve"):
    """Plot training vs. cross-validation scores as the training set grows.

    Runs 5-fold CV at 10 training-set sizes from 10% to 100% of the data;
    the shaded bands show +/- one standard deviation across folds.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1, 
        train_sizes=np.linspace(0.1, 1.0, 10)
    )
    
    # Aggregate per-size fold scores into mean and spread
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='训练分数')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, 
                     alpha=0.1, color='blue')
    
    plt.plot(train_sizes, val_mean, 'o-', color='red', label='验证分数')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, 
                     alpha=0.1, color='red')
    
    plt.xlabel('训练样本数')
    plt.ylabel('分数')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

# Draw the learning curve (X and y come from the earlier iris example)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
plot_learning_curve(rf, X, y, "随机森林学习曲线")

8. 集成学习

8.1 Bagging和Boosting

python
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier, 
                             AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier

# Base learner shared by the ensembles below
# (NOTE(review): sklearn ensembles clone the estimator internally, so
# passing the same instance to several ensembles should be safe)
base_classifier = DecisionTreeClassifier(random_state=42)

# Bagging: bootstrap-aggregated decision trees
bagging = BaggingClassifier(base_classifier, n_estimators=100, random_state=42)

# Random forest (a specialized form of bagging over trees)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# AdaBoost (boosting: later learners focus on earlier mistakes)
ada_boost = AdaBoostClassifier(base_classifier, n_estimators=100, random_state=42)

# Gradient boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Cross-validate every ensemble
# (cross_val_score, X and y come from the earlier examples)
models = [
    ('Bagging', bagging),
    ('Random Forest', rf),
    ('AdaBoost', ada_boost),
    ('Gradient Boosting', gb)
]

for name, model in models:
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

8.2 投票分类器

python
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Three heterogeneous base classifiers
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
nb = GaussianNB()

# Hard voting: majority vote over predicted class labels
hard_voting = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('nb', nb)],
    voting='hard'
)

# Soft voting: averages predicted class probabilities
# (requires every base classifier to support predict_proba)
soft_voting = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('nb', nb)],
    voting='soft'
)

# Compare the individual models against both voting schemes
models = [('Logistic Regression', lr), ('Random Forest', rf), 
          ('Naive Bayes', nb), ('Hard Voting', hard_voting), 
          ('Soft Voting', soft_voting)]

for name, model in models:
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

9. 管道(Pipeline)

9.1 基础管道

python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Chain scaling -> PCA -> classifier into a single estimator
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=10)),
    ('classifier', LogisticRegression())
])

# Fitting the pipeline fits each step in order on the training data
pipe.fit(X_train, y_train)

# Predict: test data flows through the same fitted transforms
y_pred = pipe.predict(X_test)

# Evaluate
accuracy = pipe.score(X_test, y_test)
print(f"管道准确率: {accuracy:.4f}")

9.2 复合管道

python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

# Build a frame mixing numeric and categorical columns (np, Pipeline,
# RandomForestClassifier and GridSearchCV come from earlier examples)
data = pd.DataFrame({
    'numeric1': np.random.randn(1000),
    'numeric2': np.random.randn(1000),
    'category1': np.random.choice(['A', 'B', 'C'], 1000),
    'category2': np.random.choice(['X', 'Y'], 1000)
})

# Column-wise preprocessing: scale numerics, one-hot encode categoricals
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['numeric1', 'numeric2']),
    ('cat', OneHotEncoder(drop='first'), ['category1', 'category2'])
])

# Full pipeline: preprocessing followed by the classifier
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Grid-search pipeline hyperparameters via the step__param naming scheme
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20]
}

grid_search = GridSearchCV(full_pipeline, param_grid, cv=5)
# grid_search.fit(data, target)  # requires a target variable

10. 实际项目示例

10.1 完整的分类项目

python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# End-to-end example: a customer-churn prediction project.
def customer_churn_analysis():
    """Walk through a typical churn-prediction workflow.

    Each stage is defined as an inner helper; wire them together once the
    real dataset is available (see the commented-out loading line).
    """
    # 1. Load and explore the data
    # data = pd.read_csv('customer_data.csv')
    
    # 2. Preprocessing: impute missing values and encode categoricals
    def preprocess_data(data):
        # Fill missing values with the column median.
        # numeric_only=True is required: DataFrame.median raises on
        # object-dtype columns in modern pandas.
        data = data.fillna(data.median(numeric_only=True))
        
        # Encode categorical variables.  A fresh LabelEncoder per column
        # keeps each column's category-to-integer mapping recoverable.
        categorical_columns = data.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            if col != 'target':  # assume 'target' is the label column
                data[col] = LabelEncoder().fit_transform(data[col])
        
        return data
    
    # 3. Feature engineering
    def feature_engineering(data):
        # Average monthly charge; map tenure==0 to NaN so brand-new
        # customers do not trigger a division by zero.
        data['total_charges_per_month'] = (
            data['total_charges'] / data['tenure'].replace(0, np.nan)
        )
        data['services_count'] = (data[['phone_service', 'internet_service', 
                                       'online_security']].sum(axis=1))
        return data
    
    # 4. Train and evaluate the model
    def train_and_evaluate(X, y):
        # stratify=y preserves the churn ratio in both splits
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale: fit on the training set only, transform both (no leakage)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Model training
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train_scaled, y_train)
        
        # Predict and report
        y_pred = rf.predict(X_test_scaled)
        
        print("分类报告:")
        print(classification_report(y_test, y_pred))
        
        # Confusion-matrix heatmap
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        
        # Top-10 feature importances
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        
        plt.figure(figsize=(10, 8))
        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
        plt.title('前10个重要特征')
        plt.show()
        
        return rf, scaler

# customer_churn_analysis()  # run the full analysis

11. 最佳实践

11.1 数据泄露防护

python
# WRONG: preprocessing before the split leaks test-set statistics into training
# X_scaled = StandardScaler().fit_transform(X)  # data leakage!
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# RIGHT: split first, then fit preprocessing on the training set only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit only on the training set
X_test_scaled = scaler.transform(X_test)        # only transform the test set

11.2 模型持久化

python
import joblib

# Persist the fitted model and scaler to disk
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Load them back later
loaded_model = joblib.load('model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Use the loaded artifacts for inference (same preprocessing as training)
new_data_scaled = loaded_scaler.transform(new_data)
predictions = loaded_model.predict(new_data_scaled)

11.3 性能监控

python
import time
from sklearn.metrics import accuracy_score

def benchmark_models(models, X_train, X_test, y_train, y_test):
    """Time fit/predict and measure accuracy for each named model.

    `models` maps a display name to an unfitted estimator.  Returns a
    DataFrame with one row per model: accuracy, train_time, predict_time.
    """
    rows = []

    for label, estimator in models.items():
        # Wall-clock training time
        fit_start = time.time()
        estimator.fit(X_train, y_train)
        fit_elapsed = time.time() - fit_start

        # Wall-clock prediction time
        pred_start = time.time()
        predictions = estimator.predict(X_test)
        pred_elapsed = time.time() - pred_start

        rows.append({
            'model': label,
            'accuracy': accuracy_score(y_test, predictions),
            'train_time': fit_elapsed,
            'predict_time': pred_elapsed
        })

    return pd.DataFrame(rows)

# Example usage
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression()
}

# benchmark_results = benchmark_models(models, X_train, X_test, y_train, y_test)
# print(benchmark_results)

12. 总结

Scikit-learn 是Python机器学习生态系统的核心库,提供了完整的机器学习工具链。从数据预处理到模型评估,sklearn都提供了简洁一致的API。