机器学习第七天 逐步分析昨日的数字识别
KNN算法代码(Python 实现):
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distance is the Euclidean distance between ``inX`` and every row of
    ``dataSet``.

    A small k follows the training data closely (low approximation error,
    high estimation error); a large k smooths the decision (the reverse).
    The original author's notes report, for the handwritten-digit data set,
    11 errors with k=3, 12 with k=2, 11 with k=4 and 17 with k=5 (figures
    not re-verified here).

    Args:
        inX: Feature vector of the sample to classify. Anything that
            broadcasts against one row of ``dataSet`` works: a flat
            sequence of n values or a 1 x n array.
        dataSet: ``numpy.ndarray`` holding one training sample per row
            (m x n).
        labels: Sequence of m labels; ``labels[i]`` is the class of
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training rows is clamped to that number, so every
            available sample votes instead of raising ``IndexError``.

    Returns:
        The label that received the most votes. When several labels tie,
        the one whose first vote came from the nearer neighbour wins.

    Raises:
        IndexError: If nobody can vote (``k < 1`` or ``dataSet`` has no
            rows).
    """
    from collections import Counter

    dataSetSize = dataSet.shape[0]

    # Broadcasting subtracts inX from every row at once, e.g. with
    # inX = [1, 2, 3] each row r of dataSet yields [1, 2, 3] - r.
    diffMat = inX - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # argsort returns row indices ordered from the nearest to the farthest.
    neighbourCount = max(0, min(k, dataSetSize))
    nearest = distances.argsort()[:neighbourCount]

    # Votes are inserted nearest-first; most_common keeps first-seen order
    # among equal counts, which is what resolves ties.
    votes = Counter(labels[idx] for idx in nearest)
    return votes.most_common(1)[0][0]
def _digitLabelFromFileName(fileName):
    """Return the digit encoded in a sample file name, e.g. 5 for ``5_135.txt``."""
    stem = fileName.split('.')[0]  # take off .txt
    return int(stem.split('_')[0])


def _listSampleFiles(dirPath):
    """Return the file names in ``dirPath``, leaving out hidden files."""
    import os

    # A dotfile (e.g. macOS ``.DS_Store``) has an empty stem, so parsing its
    # label would always fail on int('').
    return [name for name in os.listdir(dirPath) if not name.startswith('.')]


def handwritingClassTest(
        trainingDir='/Users/xiehao/Desktop/MachineLearning-master/input/2.KNN/trainingDigits/',
        testDir='/Users/xiehao/Desktop/MachineLearning-master/input/2.KNN/testDigits/',
        k=3):
    """Run the kNN digit classifier over a test directory and print the results.

    Every file in ``trainingDir`` becomes one row of the training matrix and
    its label is taken from the file name (``<digit>_<index>.txt``). Each
    file in ``testDir`` is then classified with ``classify0``; the prediction
    is printed next to the expected digit, followed by the total number of
    errors and the error rate.

    ``img2vector`` is defined elsewhere in the project; per the original
    notes it turns a 32x32 text image into a 1x1024 vector.

    Args:
        trainingDir: Directory holding the training samples.
        testDir: Directory holding the test samples.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. All results are printed.
    """
    import os

    # 1. Load the training set: one 1024-wide row per file, labels alongside.
    trainingFileList = _listSampleFiles(trainingDir)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    # 2. Classify every test sample and count the mismatches.
    testFileList = _listSampleFiles(testDir)
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("\nthe total number of errors is: %d" % errorCount)
    if mTest == 0:
        # Without test samples the rate would be a division by zero.
        print("\nthe total error rate is undefined: no test samples in %s" % testDir)
        return
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
作者:raphah
链接:https://www.jianshu.com/p/3af7bcb2d0dd