Implementing linear regression in two ways:
1. Calling Sklearn library functions
2. Implementing the relevant functions by hand
Reference dataset: Boston housing price prediction
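Both approaches fit the same model and minimize the same mean-squared-error cost, the first solved directly by sklearn and the second by gradient descent:

$$\hat{y} = Xw + b, \qquad J(w, b) = \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)^2$$

(The hand-built version below folds b into w by appending a column of ones to X.)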
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font capable of rendering Chinese labels
mpl.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly
import warnings
warnings.filterwarnings('ignore')
def get_dataset2():
    data = []
    label = []
    for line in open('F:/aiproject/aiprojects/info/data1.txt').readlines():
        tmpLine = line.strip().split('|')
        data.append([float(tmpLine[2]), float(tmpLine[3])])  # fields 2 and 3 (0-based) are the features
        label.append(int(tmpLine[4]))  # field 4 is the label
    # Before conversion, data and label are both plain lists.
    # Converting data and label to matrices is a crucial step
    # (np.matrix is legacy in modern NumPy, but the matrix products below rely on it).
    data = np.matrix(data)
    label = np.matrix(label).transpose()
    return data, label
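# Assumed input line format (hypothetical; inferred from the indices used
# above): fields are '|'-separated, with the two features in fields 2 and 3
# and the label in field 4, e.g. 'id|date|1.23|4.56|7|...'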
# Normalize the dataset X
def data_process(data):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(data)
    data = scaler.transform(data)
    return data
# The scaler expects array-like input, while data and label were converted
# to the Matrix type. Is that acceptable? Debugging shows that it works.
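# For reference, MinMaxScaler with feature_range (0, 1) applies, per column,
#     x' = (x - x_min) / (x_max - x_min)
# Note: strictly, the scaler should be fit on the training split only and
# then applied to the test split, to avoid leaking test-set statistics;
# here the whole dataset is scaled at once.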
# Split into training and test sets
def divided1(xdata, ydata, percent):
    data_train, data_test, label_train, label_test = train_test_split(xdata, ydata, test_size=percent)
    return data_train, data_test, label_train, label_test
# Observed shapes while debugging:
# len(data1) = 577, len(label1) = 577
# data1.shape --> (577, 2), label1.shape --> (577, 1)
# data_train.shape --> (403, 2), data_test.shape --> (174, 2)
# label_train.shape --> (403, 1), label_test.shape --> (174, 1)
# type: numpy.ndarray
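# Note: the 403/174 split above corresponds to test_size = 0.3
# (ceil(577 * 0.3) = 174); main() below uses 0.25, which gives 432/145.
# train_test_split shuffles by default; pass random_state=<int> for a
# reproducible split.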
# Plotting helper: each item in datalist is a [series, label] pair
def figure(title, *datalist):
    for jj in datalist:
        plt.plot(jj[0], '-', label=jj[1], linewidth=2)
        plt.plot(jj[0], 'o')
    plt.grid()
    plt.title(title)
    plt.legend()
    plt.show()
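# Example calls (matching how the helper is used below):
# figure('Error plot', [cost_list, 'error'])
# figure('Predicted vs. actual', [test_pre, 'predicted'], [label_test, 'actual'])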
# Prediction function 1: based on the Sklearn framework
# Arguments: data_train, label_train, data_test, label_test
def predict1(x_train_data, y_train_data, x_test_data, y_test_data):
    reg = linear_model.LinearRegression()
    # Train the model
    reg.fit(x_train_data, y_train_data)
    # Predictions on the test set
    test_pre = reg.predict(x_test_data)
    # Predictions on the training set
    train_pre = reg.predict(x_train_data)
    # Baseline MSE (predicting the mean of y) followed by the model's MSE
    train_error = [mean_squared_error(y_train_data, [np.mean(y_train_data)] * len(y_train_data)),
                   mean_squared_error(y_train_data, train_pre)]
    # Plot the errors
    figure('Error plot, final MSE = %.4f' % (train_error[-1]), [train_error, 'error'])
    # Plot predicted vs. actual values
    # (r2_score expects (y_true, y_pred), in that order)
    figure('Predicted vs. actual values, model ' + r'$R^2=%.4f$' % (r2_score(y_train_data, train_pre)),
           [test_pre, 'predicted'], [y_test_data, 'actual'])
    plt.show()
    # Parameters of the linear regression
    # print('Linear regression coefficients:\n w = %s \n b = %s' % (reg.coef_, reg.intercept_))
# Building the model by hand
# xdata = data_train [432, 2]
# ydata = label_train [432, 1]
class LinearRegression:
    def __init__(self, theta=0.2, iter_times=200000, error=1e-9):
        self.theta = theta            # learning rate
        self.iter_times = iter_times  # maximum number of iterations
        self.error = error            # early-stopping threshold on the cost decrease
    def Trans(self, xdata):
        # Fold w and b into a single parameter vector by appending a
        # column of ones to x:  y = wx + b  becomes  [W, B] . [X, 1]
        one1 = np.ones(len(xdata))
        xta = np.append(xdata, one1.reshape(-1, 1), axis=1)
        return xta  # [432, 3]
    # Gradient descent
    def Gradient(self, xdata, ydata):
        # X --> [X, 1]
        xdata = self.Trans(xdata)  # xdata = [432, 3]
        # Initialize the weights
        self.weights = np.zeros((xdata.shape[1], 1))  # weights = [3, 1]
        # Store the values of the cost function
        cost_function = []
        for i in range(self.iter_times):
            # Current predictions
            y_predict = np.dot(xdata, self.weights)  # y_predict = [432, 1]
            # Mean squared error; relies on ydata being an np.matrix, so that
            # '*' is a matrix product: [1, 432] x [432, 1] gives a scalar
            cost = np.sum((y_predict - ydata).T * (y_predict - ydata)) / len(xdata)
            cost_function.append(cost)
            # Compute the gradient
            grad = 2 * np.dot(xdata.T, (y_predict - ydata)) / len(xdata)  # [3, 432] x [432, 1] = [3, 1]
            # Update w and b
            self.weights = self.weights - self.theta * grad
            # Early-stopping mechanism
            if len(cost_function) > 1:
                if 0 < cost_function[-2] - cost_function[-1] < self.error:
                    break
        return self.weights, cost_function
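    # Derivation note (a brief sketch): with the bias folded into the
    # weights via Trans, the cost is J(w) = (1/n) * ||Xw - y||^2 and its
    # gradient is dJ/dw = (2/n) * X^T (Xw - y), which is exactly 'grad' above.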
    # Predict
    def predict2(self, xdata):
        return np.dot(self.Trans(xdata), self.weights)  # [432, 3] x [3, 1] = [432, 1]
    # Compute R^2
    def getR(self, ydata_tr, ydata_pre):
        # Both sums are [1, n] x [n, 1] matrix products, i.e. scalars
        sum_error = np.sum((ydata_tr - np.mean(ydata_tr)).T * (ydata_tr - np.mean(ydata_tr)))
        inexplicable = np.sum((ydata_tr - ydata_pre).T * (ydata_tr - ydata_pre))
        # sum_error = np.sum(((ydata_tr - np.mean(ydata_tr)) ** 2))
        # inexplicable = np.sum(((ydata_tr - ydata_pre) ** 2))
        return 1 - inexplicable / sum_error
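    # For reference: R^2 = 1 - SS_res / SS_tot, where
    # SS_tot = sum_i (y_i - mean(y))^2 and SS_res = sum_i (y_i - yhat_i)^2.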
    # Closed-form solution (normal equation)
    """ def Formula(self, xdata, ydata):
        xdata = self.Trans(xdata)
        self.weights = np.dot(np.dot(np.linalg.inv(np.dot(xdata.T, xdata)), xdata.T), ydata)
        y_predict = np.dot(xdata, self.weights)
        cost = [np.sum((ydata - np.mean(ydata)) ** 2) / len(xdata)]  # baseline cost, using mean(y) as the prediction
        cost += [np.sum((y_predict - ydata) ** 2) / len(xdata)]  # the formula yields the parameters in one step, no iteration needed
        return self.weights, cost  # two values
    """
# Linear regression with the sklearn library
"""def main():
    data1, label1 = get_dataset2()
    data1 = data_process(data1)
    percent1 = 0.25
    data_train, data_test, label_train, label_test = divided1(data1, label1, percent1)
    # data_train = np.matrix(data_train)
    # data_test = np.matrix(data_test)
    predict1(data_train, label_train, data_test, label_test)
"""
# Linear regression with the hand-built model
def main():
    data1, label1 = get_dataset2()
    data1 = data_process(data1)
    percent1 = 0.25
    data_train, data_test, label_train, label_test = divided1(data1, label1, percent1)
    data_train = np.matrix(data_train)    # [432, 2]
    data_test = np.matrix(data_test)      # [145, 2]
    label_train = np.matrix(label_train)  # [432, 1]
    label_test = np.matrix(label_test)    # [145, 1]
    regressor = LinearRegression()
    # Train: Gradient returns (weights, cost_function)
    weights, cost_function = regressor.Gradient(data_train, label_train)
    # Predictions on the test set
    test_pre = regressor.predict2(data_test)
    # Predictions on the training set
    train_pre = regressor.predict2(data_train)
    # Plot the errors
    figure('Error plot, final MSE = %.4f' % (cost_function[-1]), [cost_function, 'error'])
    # Plot predicted vs. actual values
    figure('Predicted vs. actual values, model ' + r'$R^2=%.4f$' % (regressor.getR(label_train, train_pre)),
           [test_pre, 'predicted'], [label_test, 'actual'])
    plt.show()
if __name__ == "__main__":
    main()
Top comments
Improvement: when splitting the training and test sets, it is better to use StratifiedShuffleSplit, which keeps the label proportions consistent between the two splits and can reduce overfitting.
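A minimal sketch of that suggestion, assuming the integer labels loaded by get_dataset2 are class-like (StratifiedShuffleSplit requires discrete labels, and every label value must occur at least twice; a continuous regression target would first have to be binned). The hypothetical divided2 below is a drop-in replacement for divided1:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def divided2(xdata, ydata, percent):
    # One stratified, shuffled split; random_state makes it reproducible
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=percent, random_state=42)
    train_idx, test_idx = next(splitter.split(np.asarray(xdata), np.asarray(ydata).ravel()))
    return xdata[train_idx], xdata[test_idx], ydata[train_idx], ydata[test_idx]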