# Reference: "Machine Learning in Action" (《机器学习实战》)
# k-nearest-neighbor (kNN) classifies by measuring distances between feature values.
# Pros: high accuracy, insensitive to outliers, no assumptions about the input data.
# Cons: high computational and memory cost. Works with: numeric and nominal values.
"""k-nearest-neighbor (kNN) demo classifier from "Machine Learning in Action"."""

import operator

import numpy as np


def createDataSet():
    """Return a tiny demo training set: four 2-D points labeled A/A/B/B."""
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inx, dataSet, labels, k):
    """Classify *inx* by majority vote among its k nearest training samples.

    Args:
        inx: input vector to classify (1-D, same width as a dataSet row).
        dataSet: 2-D array of training samples, one row per sample.
        labels: class label for each row of dataSet.
        k: number of nearest neighbors to poll.

    Returns:
        The label occurring most often among the k nearest neighbors
        (ties broken by first-seen order, as in the original).
    """
    dataSize = dataSet.shape[0]  # number of training samples
    # Euclidean distance from inx to every row of dataSet.
    diffMat = np.tile(inx, (dataSize, 1)) - dataSet
    distances = np.sqrt((diffMat ** 2).sum(axis=1))
    sortedDistIndices = distances.argsort()  # indices, nearest first
    # Tally the labels of the k closest samples.
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndices[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # Sort label counts descending; the winner is the majority vote.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


if __name__ == "__main__":
    # Demo driver: guarded so importing this module has no side effects.
    group, labels = createDataSet()
    print(classify0([1.0, 1.2], group, labels, 2))