Notes

Wrapping machine-learning algorithms by imitating sklearn's API

Formulas used in the code:
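
These can be read directly off the implementations below (here $X_b$ denotes the training matrix with a prepended column of ones):

Euclidean distance used by KNN:
$$d\left(x, x^{(i)}\right) = \sqrt{\sum_{j=1}^{n} \left(x^{(i)}_j - x_j\right)^2}$$

Normal equation used by LinearRegression.fit_normal:
$$\theta = \left(X_b^T X_b\right)^{-1} X_b^T y$$

$R^2$ used by LinearRegression.score:
$$R^2 = 1 - \frac{\sum_i \left(\hat{y}^{(i)} - y^{(i)}\right)^2}{\sum_i \left(\bar{y} - y^{(i)}\right)^2}$$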


#Each class lives in its own .py file

#KNN.py
import numpy as np
from math import sqrt 
from collections import Counter

class KNNClassifier:
	def __init__(self, k):
		"""Initialize the KNN classifier (constructor)."""
		assert k >= 1, "k must be valid"
		self.k = k
		#the leading underscore marks these as private member variables
		self._X_train = None
		self._y_train = None

	def fit(self, X_train, y_train):
		"""Train the kNN classifier on the training set X_train and y_train."""
		assert X_train.shape[0] == y_train.shape[0], \
			"the size of X_train must be equal to the size of y_train"
		assert self.k <= X_train.shape[0],\
			"the size of X_train must be at least k."
		self._X_train = X_train
		self._y_train = y_train
		return self

	#sklearn expects the data passed to a predictor to be a 2-D matrix
	#sklearn returns its results as an np.array
	def predict(self, X_predict):
		"""Given a data set X_predict to predict, return a vector of predictions for X_predict."""
		assert self._X_train is not None and self._y_train is not None,\
			"must fit before predict!"
		assert X_predict.shape[1] == self._X_train.shape[1],\
			"the feature number of X_predict must be equal to X_train"

		y_predict = [self._predict(x) for x in X_predict]
		return np.array(y_predict)

	#private predict helper
	def _predict(self, x):
		"""Given a single sample x, return the predicted label for x."""
		assert x.shape[0] == self._X_train.shape[1],\
			"the feature number of x must be equal to X_train"
		#compute the Euclidean distance from x to every training sample
		distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]
		#argsort gives the indices of the training samples sorted by distance
		nearest = np.argsort(distances)
		topK_y = [self._y_train[i] for i in nearest[:self.k]]
		#majority vote among the k nearest labels
		votes = Counter(topK_y)
		return votes.most_common(1)[0][0]

	def __repr__(self):
		return "KNN(k = %d)"  % self.k 



#How to use KNN.py:
"""
1. Basic usage
knn_clf = KNNClassifier(k = 6)
knn_clf.fit(X_train,y_train)

y_predict = knn_clf.predict(X_predict)
"""

"""
2. Tuning KNN with grid search
#first build a list of parameter grids (dicts) for the hyperparameters to search over;
#the parameter names below are those of sklearn's KNeighborsClassifier
param_grid = [
	{
		'weights':['uniform'],
		'n_neighbors':[i for i in range(1,11)]
	},
	{
		'weights':['distance'],
		'n_neighbors':[i for i in range(1,11)],
		'p': [i for i in range(1,6)]
	}
]

#GridSearchCV needs a sklearn-compatible estimator that exposes the parameters
#listed above, so sklearn's own KNeighborsClassifier is used here (the custom
#KNNClassifier does not implement get_params/set_params)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()


from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf,param_grid)
#fit via grid_search (one model is trained per parameter combination)
grid_search.fit(X_train,y_train)
#this step can take quite a while
grid_search.best_estimator_
#the estimator refit with the best hyperparameter combination
grid_search.best_score_
#the cross-validation score of the best model
grid_search.best_params_
#the best hyperparameter values
knn_clf = grid_search.best_estimator_
#grab the classifier corresponding to the best parameters
y_predict = knn_clf.predict(X_test)

"""


#model_selection.py
import numpy as np
def train_test_split(X,y,test_ratio = 0.2,seed = None):
	assert X.shape[0] == y.shape[0],\
		"the size of X must be equal to the size of y"
	assert 0.0 <= test_ratio <= 1.0,\
		"test_ration must be valid"

	#if a seed was given, use it so the split is reproducible
	if seed is not None:
		np.random.seed(seed)

	shuffled_indexes = np.random.permutation(len(X))
	#np.random.permutation(len(X)) returns an array containing a random
	#permutation of the indices 0 .. len(X)-1

	test_size = int(len(X) * test_ratio) 
	test_indexes = shuffled_indexes[:test_size]
	train_indexes = shuffled_indexes[test_size:]

	X_train = X[train_indexes]
	y_train = y[train_indexes]

	X_test = X[test_indexes]
	y_test = y[test_indexes]

	return X_train,X_test,y_train,y_test
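
#How model_selection.py is used (a sketch; X and y below are illustrative arrays,
#any matching feature matrix / label vector works):
"""
import numpy as np
from model_selection import train_test_split

X = np.arange(20).reshape(10, 2)     # 10 samples, 2 features
y = np.arange(10)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)
# X_train.shape == (8, 2), X_test.shape == (2, 2)
"""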


#Multivariate linear regression solved with the normal equation
#LinearRegression.py
import numpy as np 
from sklearn.metrics import r2_score
class LinearRegression:
	def __init__(self):
		"""Initialize the Linear Regression model."""
		self.coef_ = None
		self.intercept_ = None
		self._theta = None
	#training: solve the normal equation directly
	def fit_normal(self, X_train, y_train):
		"""Fit the model on X_train and y_train via the normal equation."""
		assert X_train.shape[0] == y_train.shape[0],\
			"the size of X_train must be equal to the size of y_train"
		#prepend a column of ones so that theta[0] is the intercept
		X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
		#normal equation: theta = (X_b^T X_b)^-1 X_b^T y
		#matrix inverse via np.linalg.inv
		self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
		#intercept
		self.intercept_ = self._theta[0]
		#weights (coefficients)
		self.coef_ = self._theta[1:]

		return self 

	#prediction
	def predict(self, X_predict):
		"""Given a data set X_predict to predict, return a vector of predictions for X_predict."""
		assert self.intercept_ is not None and self.coef_ is not None,\
			"must fit before predict!"
		assert X_predict.shape[1] == len(self.coef_),\
			"the feature number of X_predict must be equal to X_train"

		X_b = np.hstack([np.ones((len(X_predict),1)),X_predict])
		return X_b.dot(self._theta)

	def score(self,X_test,y_test):
		y_predict = self.predict(X_test)
		return r2_score(y_test,y_predict)

	def __repr__(self):
		return "LinearRegression()"



