Formulas used in the code
# Each class lives in its own .py file
# KNN.py
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:

    def __init__(self, k):
        # Constructor
        """Initialize the kNN classifier"""
        assert k >= 1, "k must be valid"
        self.k = k
        # The leading underscore marks these as private member variables
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Train the kNN classifier on the training set X_train, y_train"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        return self

    # Following the sklearn convention, the data passed to predict must be a 2-D matrix
    # and, like sklearn, the result is returned as an np.array
    def predict(self, X_predict):
        """Given a dataset X_predict to be predicted, return the result vector for X_predict"""
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    # Private predict helper
    def _predict(self, x):
        """Given a single sample x, return the predicted label for x"""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"
        # Compute the distance to every training sample
        distances = [
            sqrt(np.sum((x_train - x) ** 2))
            for x_train in self._X_train
        ]
        # Sort the distances (argsort returns the indices)
        nearest = np.argsort(distances)
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        # Majority vote among the k nearest labels
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k = %d)" % self.k
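# The distance computed in _predict is the Euclidean distance,
#     d(x, x_train) = sqrt(sum_j (x_train_j - x_j) ** 2)
# The list comprehension above can also be written in vectorized NumPy form; a small sketch,
# equivalent for the same inputs (X_train is the training matrix, x a single sample):
"""
import numpy as np
distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
nearest = np.argsort(distances)
"""
# Also note that predict follows the sklearn convention and expects a 2-D matrix, so a single
# new sample x_new should be passed as knn_clf.predict(x_new.reshape(1, -1)).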
# How to use KNN.py:
"""
1. Basic usage
knn_clf = KNNClassifier(k = 6)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_predict)
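# To evaluate the predictions, accuracy is the fraction of correctly classified samples
# (a small sketch; y_test is assumed to hold the true labels for X_predict):
accuracy = np.sum(y_predict == y_test) / len(y_test)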
"""
"""
2. Tuning KNN hyperparameters with grid search
# First build a list of dictionaries for the hyperparameters to search over
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
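# Note on the 'p' entry above: in sklearn's KNeighborsClassifier it is the exponent of the
# Minkowski distance,
#     d(a, b) = (sum_i |a_i - b_i| ** p) ** (1 / p)
# so p = 1 gives the Manhattan distance and p = 2 the Euclidean distance used in KNN.py,
# for example (sketch values only):
#     (|0 - 3| ** 2 + |0 - 4| ** 2) ** (1 / 2) = 5.0    # p = 2, Euclidean
#     (|0 - 3| ** 1 + |0 - 4| ** 1) ** (1 / 1) = 7.0    # p = 1, Manhattan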
# GridSearchCV expects an sklearn estimator, and the parameter names above belong to
# sklearn's KNeighborsClassifier, so use that class here
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf, param_grid)
# Fit with grid_search
grid_search.fit(X_train, y_train)
# This step can be fairly time-consuming
grid_search.best_estimator_
# Returns the estimator configured with the best hyperparameters
grid_search.best_score_
# Returns the cross-validation score of the best model
grid_search.best_params_
# Returns the best hyperparameters
knn_clf = grid_search.best_estimator_
# Grab the classifier corresponding to the best hyperparameters
y_predict = knn_clf.predict(X_test)
"""
#model_selection.py
import numpy as np
def train_test_split(X, y, test_ratio=0.2, seed=None):
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"
    # Fix the random seed if one is given
    if seed is not None:
        np.random.seed(seed)
    # np.random.permutation(len(X)) returns an array holding a random
    # permutation of the indices of X
    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, X_test, y_train, y_test
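# A minimal usage sketch of the train_test_split above (the iris dataset is only an example;
# any X and y of matching length work):
"""
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)
X_train.shape, X_test.shape   # (120, 4), (30, 4)
y_train.shape, y_test.shape   # (120,), (30,)
"""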
# Multivariate linear regression based on the normal equation
#LinearRegression.py
import numpy as np
from sklearn.metrics import r2_score
class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    # Training
    def fit_normal(self, X_train, y_train):
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        # Normal equation; np.linalg.inv computes the matrix inverse
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        # Intercept
        self.intercept_ = self._theta[0]
        # Weights
        self.coef_ = self._theta[1:]
        return self

    # Prediction
    def predict(self, X_predict):
        """Given a dataset X_predict to be predicted, return the result vector for X_predict"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
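# The formula behind fit_normal is the normal equation (with X_b = [1, X_train]):
#     theta = (X_b^T X_b)^(-1) X_b^T y
# and score() reports sklearn's r2_score:
#     R^2 = 1 - sum((y_hat - y) ** 2) / sum((mean(y) - y) ** 2)
# A minimal usage sketch with synthetic data (the coefficients 3, 4 and intercept 5 are made
# up just for illustration):
"""
import numpy as np
np.random.seed(666)
X = np.random.random(size=(100, 2))
y = X.dot(np.array([3.0, 4.0])) + 5.0 + np.random.normal(0.0, 0.1, size=100)
reg = LinearRegression()
reg.fit_normal(X, y)
reg.coef_        # close to [3., 4.]
reg.intercept_   # close to 5.
reg.score(X, y)  # R^2 close to 1
"""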