算法原理
将每个新数据的特征与样本集中每个样本的对应特征进行比较,计算两者之间的距离。将距离按递增顺序排序,选取距离最小的前 k 个点,统计这 k 个点所属标签的出现频率,并将出现频率最高的标签作为当前数据的预测分类。
此处使用欧氏距离公式:$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$
代码:
import operator

import matplotlib
import matplotlib.pylab as plt
import numpy as np
"""
函数说明:
创建数据集的例子
参数:
无
返回值:
group:数据集
labels:标签类别
"""
def createDataSet():
    """Build a tiny labelled data set for trying out the kNN classifier.

    Returns:
        group: NumPy array of shape (4, 2) holding four 2-D sample points.
        labels: list of four class labels ('A' or 'B'), one per row of
            ``group``.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
"""
函数说明:
kNN分类器,k-近邻算法
计算已知类别点和当前点的距离,从前k个距离最小的点中,选取出现频率最高的类别作为当前点的类别
参数:
inx: 用于分类的数据,测试集
dataSet: 训练样本集
labels: 标签类别
k: 选取最近邻近的数目
返回值:
sortClassCount[0][0]: 分类结果
"""
def classify0(inx, dataSet, labels, k):
    """Classify one sample with the k-nearest-neighbours rule.

    The Euclidean distance from ``inx`` to every row of ``dataSet`` is
    computed, the ``k`` closest rows are selected, and the label that occurs
    most often among them is returned.  When several labels are tied, the one
    belonging to the nearest neighbour among the tied labels wins.

    Args:
        inx: feature vector to classify (same number of features as a row
            of ``dataSet``).
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: sequence of class labels, ``labels[i]`` belonging to row
            ``i`` of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The predicted class label for ``inx``.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (dataSetSize, k)
        )

    # Broadcasting subtracts inx from every row; no explicit tiling needed.
    diffMat = dataSet - np.asarray(inx)
    distance = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distance.argsort()

    # Tally the labels of the k closest samples, nearest first, so that the
    # dict's insertion order reflects proximity.
    classCount = {}
    for i in sortedDistIndicies[:k]:
        voteIlabel = labels[i]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first maximal key, i.e. the tied label seen earliest.
    return max(classCount, key=classCount.get)
"""
函数说明:
打开文件,读取数据信息
参数:
filename: 文件名
返回值:
returnMat: 特征矩阵
classLabelVector: 类别向量
"""
def file2matrix(filename):
    """Read a whitespace-separated data file into features and labels.

    Each non-blank line is split on whitespace.  The first three fields are
    stored as floating-point features and the last field is parsed as an
    integer class label.  Blank lines are ignored.

    Args:
        filename: path of the text file to read.

    Returns:
        returnMat: NumPy array of shape (number_of_records, 3) with the
            feature values.
        classLabelVector: list of integer class labels, one per record.
    """
    # The context manager guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        records = [line.split() for line in fr if line.strip()]

    returnMat = np.zeros((len(records), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(records):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
"""
函数说明:
归一化数值,将特征值转化为0~1区间内的值
公式:newValue = (oldValue - min) / (max - min)
参数:
dataSet: 特征矩阵
返回值:
normDataSet: 归一化后的特征矩阵
ranges: 数据范围
minVals: 最小值
"""
def autoNorm(dataSet):
    """Rescale every feature column linearly into the interval [0, 1].

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.
    A column whose values are all identical has a range of zero; such a
    column is mapped to 0 instead of dividing by zero.

    Args:
        dataSet: 2-D NumPy array of features, one sample per row.

    Returns:
        normDataSet: array of the same shape with the normalised values.
        ranges: per-column ``max - min`` (zero for constant columns).
        minVals: per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they come out as 0 rather than NaN;
    # the unmodified ranges are still what gets returned to the caller.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
"""
函数说明:
分类器测试
参数:
无
返回值:
无
"""
def datingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=3):
    """Measure the kNN classifier's error rate on a hold-out split.

    The data file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``.  The first ``int(m * hoRatio)`` rows are classified with
    ``classify0`` using the remaining rows as training samples.  Each
    prediction and the final error rate are printed.

    Args:
        hoRatio: fraction of the rows (taken from the start of the file)
            used as test vectors.
        filename: path of the data file to evaluate.
        k: number of neighbours passed to ``classify0``.

    Raises:
        ValueError: if the split leaves either the test set or the training
            set empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Without this guard an empty test set divides by zero below, and an
    # empty training set fails inside the classifier.
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r leaves %d test and %d training samples out of %d"
            % (hoRatio, numTestVecs, m - numTestVecs, m)
        )

    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
# Run the hold-out evaluation only when the file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    datingClassTest()
转载请注明原文地址: https://lol.8miu.com/read-10991.html