手记

线性回归算法实例

基于两种方式实现线性回归算法
1.调用Sklearn库函数
2.自己实现相关函数
参考数据集:波士顿房价预测

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font for Chinese text in plots
mpl.rcParams['axes.unicode_minus'] = False # minus-sign rendering setting


import warnings
# Suppress every warning for the rest of the run.
warnings.filterwarnings('ignore')

def get_dataset2(path='F:/aiproject/aiprojects/info/data1.txt'):
    """Load features and labels from a '|'-separated text file.

    Every non-blank line is split on '|'. Fields 2 and 3 (0-based) are parsed
    as float features and field 4 as the integer label.

    Args:
        path: File to read. Defaults to the location used by the original
            script, so existing ``get_dataset2()`` calls keep working.

    Returns:
        A ``(data, label)`` pair of ``np.matrix`` objects: ``data`` has one
        row per sample and two columns, ``label`` is a single column.

    Raises:
        OSError: If the file cannot be opened.
        IndexError, ValueError: If a non-blank line has too few fields or a
            field that cannot be parsed as a number.
    """
    features = []
    targets = []
    # The context manager closes the handle; the original never closed it.
    with open(path) as source:
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no sample.
                continue
            fields = stripped.split('|')
            features.append([float(fields[2]), float(fields[3])])
            targets.append(int(fields[4]))
    # Convert the Python lists to matrices; the labels become a column vector.
    data = np.matrix(features)
    label = np.matrix(targets).transpose()
    return data, label

# Normalise the feature set X
def data_process(data):
    """Rescale every column of ``data`` into the range [0, 1].

    A fresh MinMaxScaler is fitted on ``data`` and then applied to that same
    data; the transformed result is returned.

    Author's note carried over from the original comments: the scaler expects
    array-type input while the loader produces matrix objects; the author
    reports that passing the matrix through worked when debugged.
    """
    normalizer = MinMaxScaler(feature_range=(0, 1))
    fitted = normalizer.fit(data)
    return fitted.transform(data)

# Split into a training set and a test set
def divided1(xdata, ydata, percent):
    """Split features and labels with ``train_test_split``.

    Args:
        xdata: Feature data.
        ydata: Label data, aligned with ``xdata``.
        percent: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        The tuple ``(data_train, data_test, label_train, label_test)``.
    """
    parts = train_test_split(xdata, ydata, test_size=percent)
    x_fit, x_eval, y_fit, y_eval = parts
    return x_fit, x_eval, y_fit, y_eval
    
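# Shapes noted by the author for one run (577 samples in total):
#   len(data1) = 577, len(label1) = 577
#   data1.shape  --> (577, 2)
#   label1.shape --> (577, 1)
#   data_train.shape  --> (403, 2)
#   data_test.shape   --> (174, 2)
#   label_train.shape --> (403, 1)
#   label_test.shape  --> (174, 1)
#   type: numpy.ndarray
# Note: a 403/174 split corresponds to percent = 0.3; main() uses 0.25,
# which gives the 432/145 split mentioned in the comments further down.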


# Plotting helper
def figure(title, *datalist):
    """Draw every series as a labelled line plus point markers and show it.

    Args:
        title: Title placed on the plot.
        *datalist: Sequences whose item 0 holds the values to plot and whose
            item 1 is the legend label for that series.
    """
    for entry in datalist:
        values = entry[0]
        legend_name = entry[1]
        # Line first (carries the legend label), then the markers on top.
        plt.plot(values, '-', label=legend_name, linewidth=2)
        plt.plot(values, 'o')
    plt.grid()
    plt.title(title)
    plt.legend()
    plt.show()

# Prediction routine 1: scikit-learn's LinearRegression
# Called as predict1(data_train, label_train, data_test, label_test)
def predict1(x_train_data, y_train_data, x_test_data, y_test_data):
    """Fit sklearn's LinearRegression and plot its error and predictions.

    Two figures are shown: a two-point error plot (baseline MSE, fitted MSE)
    and the test-set predictions against the true test labels, titled with
    the R^2 score computed on the training set.

    Args:
        x_train_data: Training features.
        y_train_data: Training labels.
        x_test_data: Test features.
        y_test_data: Test labels.

    Returns:
        None. The function only produces plots.
    """
    reg = linear_model.LinearRegression()
    # Train the model.
    reg.fit(x_train_data, y_train_data)
    # Predictions for the test set and for the training set.
    test_pre = reg.predict(x_test_data)
    train_pre = reg.predict(x_train_data)

    # First point: MSE of always predicting the training mean.
    # Second point: MSE of the fitted model on the training data.
    baseline = [np.mean(y_train_data)] * len(y_train_data)
    train_error = [
        mean_squared_error(y_train_data, baseline),
        mean_squared_error(y_train_data, train_pre),
    ]
    # Error plot.
    figure('误差图 最终的MSE = %.4f' % (train_error[-1]), [train_error, 'error'])

    # r2_score expects (y_true, y_pred) and is not symmetric; the original
    # call passed the predictions first, which yields a different value.
    train_r2 = r2_score(y_train_data, train_pre)
    # Predicted vs. true values. figure() already calls plt.show(), so no
    # further show() is needed afterwards.
    figure('预测值与真实值图 模型的' + r'$R^2=%.4f$' % (train_r2), [test_pre, '预测值'],
           [y_test_data, '真实值'])

#线性回归的参数
#print('线性回归的系数为:\n w = %s \n b = %s' % (reg.coef_,reg.intercept_))

# Hand-written model
# Fitted in main() on the training split (features: one row per sample,
# labels: one column).
class LinearRegression:
    """Linear regression fitted by batch gradient descent on the MSE.

    The bias is folded into the weight vector by appending a column of ones
    to the features, so ``weights`` has one row per feature followed by one
    bias row. Inputs may be ``np.matrix`` or plain ``ndarray`` objects.
    """

    def __init__(self, theta=0.2, iter_times=200000, error=1e-9):
        """Store the optimiser settings.

        Args:
            theta: Step size applied to each gradient update.
            iter_times: Maximum number of gradient-descent iterations.
            error: Training stops early once the cost decreases by less than
                this amount between two consecutive iterations.
        """
        self.theta = theta
        self.iter_times = iter_times
        self.error = error

    def Trans(self, xdata):
        """Return ``xdata`` with a trailing column of ones appended.

        With y = w*x + b, this turns [X] into [X, 1] so that w and b can be
        learned as one parameter vector.
        """
        bias_column = np.ones((len(xdata), 1))
        return np.append(xdata, bias_column, axis=1)

    # Gradient descent
    def Gradient(self, xdata, ydata):
        """Fit the weights by gradient descent.

        Args:
            xdata: Features, one row per sample.
            ydata: Targets; any shape holding one value per sample (it is
                reshaped to a single column).

        Returns:
            ``(weights, cost_function)``: the fitted weights (also stored on
            ``self.weights``) and the list of MSE values, one per iteration.
        """
        design = self.Trans(xdata)
        # asanyarray keeps np.matrix inputs as matrices; the reshape makes a
        # 1-D target a column so the residual stays (n, 1) instead of
        # broadcasting to (n, n).
        targets = np.asanyarray(ydata).reshape(-1, 1)
        n_samples = len(design)
        # Start from all-zero weights (one per column of the design matrix).
        self.weights = np.zeros((design.shape[1], 1))
        # Cost recorded at every iteration.
        cost_function = []

        for _ in range(self.iter_times):
            residual = np.dot(design, self.weights) - targets
            # Mean squared error. Squaring element-wise on a plain array gives
            # the sum of squares for both ndarray and np.matrix inputs; the
            # original `r.T * r` was only correct for np.matrix.
            cost = np.sum(np.square(np.asarray(residual))) / n_samples
            cost_function.append(cost)

            # Gradient of the MSE with respect to the weights.
            grad = 2 * np.dot(design.T, residual) / n_samples

            # Update w and b together.
            self.weights = self.weights - self.theta * grad

            # Early stop: the cost no longer improves by at least `error`.
            # `<=` also stops when the cost is exactly unchanged, which the
            # original strict `0 <` never did.
            if len(cost_function) > 1:
                improvement = cost_function[-2] - cost_function[-1]
                if 0 <= improvement < self.error:
                    break
        return self.weights, cost_function

    # Prediction
    def predict2(self, xdata):
        """Return the model output for ``xdata`` using the fitted weights.

        ``Gradient`` must have been called first so ``self.weights`` exists.
        """
        return np.dot(self.Trans(xdata), self.weights)

    # R^2 score
    def getR(self, ydata_tr, ydata_pre):
        """Return the coefficient of determination R^2.

        Args:
            ydata_tr: True target values.
            ydata_pre: Predicted values, one per true value.

        Returns:
            ``1 - SS_res / SS_tot``, where SS_res is the sum of squared
            prediction errors and SS_tot the sum of squared deviations of the
            true values from their mean.
        """
        # Flatten to plain 1-D arrays so the arithmetic is element-wise for
        # both ndarray and np.matrix inputs.
        actual = np.asarray(ydata_tr, dtype=float).ravel()
        predicted = np.asarray(ydata_pre, dtype=float).ravel()
        sum_error = np.sum(np.square(actual - actual.mean()))
        inexplicable = np.sum(np.square(actual - predicted))
        return 1 - inexplicable / sum_error

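    # Closed-form (normal equation) solution. It is disabled: the code below
    # is wrapped in a string literal and is not a method of the class.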
""" def Formula(self, xdata, ydata):
        xdata = self.Trans(xdata)
        self.weights = np.dot(np.dot(np.linalg.inv(np.dot(xdata.T, xdata)), xdata.T), ydata)
        y_predict = np.dot(xdata, self.weights)
        cost = [np.sum((ydata - np.mean(ydata)) ** 2) / len(xdata)]  # 开始是以y值得平均值作为预测值计算cost
        cost += [np.sum((y_predict - ydata) ** 2) / len(xdata)]  # 利用公式,一次计算便得到参数的值,不需要迭代。
        return self.weights, cost  # 包括2个值
"""

# Linear regression via the sklearn library (disabled: this main() is wrapped in a string literal)
"""def main():
	data1 = []
	label1 = []
	data1,label1 = get_dataset2()
	data1 = data_process(data1)
	data_train = [] 
	data_test  = [] 
	label_train = []
	label_test  = []
	percent1 = 0.25
	data_train,data_test,label_train,label_test  = divided1(data1,label1,percent1)

	#data_train = np.matrix(data_train)
	#data_test  = np.matrix(data_test)
	
	predict1(data_train,label_train,data_test,label_test)
"""
# Linear regression with the hand-written model
def main():
    """Load the data, fit the hand-written LinearRegression and plot results.

    Steps: read the dataset, normalise the features, hold out 25% as the test
    split, convert every split to np.matrix, train by gradient descent, then
    show the cost curve and the test predictions against the true labels
    (titled with the R^2 computed on the training split).
    """
    features, labels = get_dataset2()
    features = data_process(features)
    test_fraction = 0.25
    split = divided1(features, labels, test_fraction)
    # Same order as divided1 returns them: train/test features, train/test labels.
    data_train, data_test, label_train, label_test = (np.matrix(part) for part in split)

    regressor = LinearRegression()
    # Train; Gradient returns (weights, cost history).
    fit_result = regressor.Gradient(data_train, label_train)
    cost_history = fit_result[1]
    # Predictions for the test split and for the training split.
    test_pre = regressor.predict2(data_test)
    train_pre = regressor.predict2(data_train)
    # Error plot.
    figure('误差图 最终的MSE = %.4f' % (cost_history[-1]), [cost_history, 'error'])
    # Predicted vs. true values.
    r_squared = regressor.getR(label_train, train_pre)
    figure('预测值与真实值图 模型的' + r'$R^2=%.4f$' % (r_squared), [test_pre, '预测值'],
           [label_test, '真实值'])
    plt.show()


# Run main() only when this file is executed as a script.
if __name__  == "__main__":
	main()

0人推荐
随时随地看视频
慕课网APP

热门评论

改进:对训练集和测试集划分时最好使用StratifiedShuffleSplit,可以减少过拟合

查看全部评论