# Author: Coder_zheng
# Posted: 2019-05-29 21:59:37 (1919 views)
import os
import pandas as pd
import numpy as np
import math
def train_data_reads(path):
    """Load the first file found in ``<path>/data`` into a DataFrame.

    The loader is picked from the file extension (case-insensitive):
    ``csv`` is parsed as comma-separated text, ``txt`` as tab-separated
    text, and anything else is handed to ``pandas.read_excel``. Text files
    are decoded as GBK first, with a UTF-8 retry when GBK decoding fails.

    Args:
        path: Directory that contains a ``data`` sub-directory.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        IndexError: If the ``data`` directory contains no entries.
    """
    data_directory = os.path.join(path, "data")
    # os.listdir gives no ordering guarantee; the first entry returned is used.
    file_name = os.listdir(data_directory)[0]
    data_path = os.path.join(data_directory, file_name)

    # splitext copes with names such as "scores.v2.csv" or "README", which a
    # two-way unpack of file_name.split(".") cannot.
    extension = os.path.splitext(file_name)[1].lstrip(".").lower()

    if extension == "csv":
        separator = ","
    elif extension == "txt":
        separator = "\t"
    else:
        return pd.read_excel(data_path)

    try:
        return pd.read_csv(data_path, encoding="gbk", sep=separator)
    except UnicodeDecodeError:
        # Only a decoding failure justifies retrying with another encoding;
        # any other error (missing file, malformed rows) should propagate.
        return pd.read_csv(data_path, encoding="utf-8", sep=separator)
def feature_label_split(data):
    """Drop unlabeled rows and binarise the last column of ``data``.

    The last column is treated as the label. Rows whose label is missing
    are removed; remaining labels become ``0`` when the value is ``>= 7``
    and ``1`` otherwise.

    Args:
        data: DataFrame whose last column holds the numeric label.

    Returns:
        A new DataFrame with the original feature columns followed by the
        binarised label. The label column is named ``0`` (not the original
        label name), and the surviving rows keep their original index.
    """
    label_name = data.columns[-1]

    # notna() also handles object-dtype columns (e.g. ones holding None),
    # where np.isnan raises TypeError.
    labeled = data[data[label_name].notna()]

    features = labeled.drop([label_name], axis=1)
    scores = labeled[label_name]

    # Vectorised form of "0 if score >= 7 else 1"; the column name 0 is kept
    # so the result has the same layout callers already receive.
    binary = pd.Series(
        np.where(scores >= 7, 0, 1),
        index=labeled.index,
        name=0,
    ).astype("int64")

    return pd.concat([features, binary], axis=1)
def main(path="E:/AnaLinReg/Data_upload_cls", output_path="D1.csv"):
    """Convert the regression-labeled dataset into a classification dataset.

    Reads the first file under ``<path>/data``, binarises its last column
    via ``feature_label_split`` and writes the result as UTF-8 CSV.

    Args:
        path: Directory containing the ``data`` sub-directory to read from.
            Defaults to the location the script was originally written for.
        output_path: Destination CSV file. Defaults to ``D1.csv`` in the
            current working directory.
    """
    data = train_data_reads(path)
    data = feature_label_split(data)
    data.to_csv(output_path, encoding="utf-8")
    print("Done")


if __name__ == "__main__":
    main()
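# Regression label: the last column is a number from 1 to 10.
# Classification label: the last column contains only 0 and 1.
# Regression values 0~6 map to classification label 1.
# Regression values 7~10 map to classification label 0.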