Table of Contents
1. Textbook exercises
2. Video exercises
  1. KNN from scratch
  2. sklearn implementation
1. Textbook exercises
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier

# Training data: 18 points with binary labels (0/1).
x = np.array([[0.5, 0.9], [0.7, 2.8], [1.3, 4.6], [1.4, 2.8], [1.7, 0.8], [1.9, 1.4], [2, 4], [2.3, 3], [2.5, 2.5], [2.9, 2],
              [2.9, 3], [3, 4.5], [3.3, 1.1], [4, 3.7], [4, 2.2], [4.5, 2.5], [4.6, 1], [5, 4]])
y = np.array([0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0])

x_min, x_max = 0, 6
y_min, y_max = 0, 6
cmap_light = ListedColormap(['#FFFFFF', '#BDBDBD'])

# Evaluate the classifier on a dense grid to draw the decision regions.
h = .01
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x, y)
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()
plt.xticks(range(6))
plt.yticks(range(1, 6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.xlabel('$x^{(1)}$')
plt.ylabel('$x^{(2)}$')
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()
(Figures: the resulting decision regions, for k=2 and for k=1.)
Overall, decreasing k makes the model more complex and prone to overfitting; increasing k makes the model simpler, which reduces the estimation error of learning but increases the approximation error. Cross-validation is usually used to select the optimal k; see Section 3.2.3 of the book for more on choosing k.
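As a minimal sketch of that cross-validation step with sklearn (the 5-fold setup and the candidate grid are assumptions for illustration; x and y are the arrays from the code above):

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validation over a few odd k values (assumed grid, for illustration).
param_grid = {'n_neighbors': [1, 3, 5, 7]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
search.fit(x, y)
print(search.best_params_, search.best_score_)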
2. Video exercises
The model complexity of k-nearest neighbors is governed mainly by the value of k:
- when k is small, the model tends to overfit;
- when k is large, the model tends to underfit.
1. KNN from scratch
"""
@author: liujie
@software: PyCharm
@file: KNN自编程.py
@time: 2020/10/21 12:58
"""
import numpy
as np
from collections
import Counter
import matplotlib
.pyplot
as plt
class KNN():
def __init__(self
,x_train
,y_train
,k
=3):
self
.k
= k
self
.x_train
= x_train
self
.y_train
= y_train
def predict(self
,x_test
):
dist_list
= [(np
.linalg
.norm
(x_test
-self
.x_train
[i
],2),self
.y_train
[i
])
for i
in range(self
.x_train
.shape
[0])]
dist_list
.sort
(key
=lambda x
:x
[0])
y_list
= [dist_list
[i
][-1] for i
in range(self
.k
)]
y_count
= Counter
(y_list
).most_common
()
return y_count
[0][0]
def draw(X_train
,y_train
,X_new
):
X_po
=np
.zeros
(X_train
.shape
[1])
X_ne
=np
.zeros
(X_train
.shape
[1])
for i
in range(y_train
.shape
[0]):
if y_train
[i
]==1:
X_po
=np
.vstack
((X_po
,X_train
[i
]))
else:
X_ne
=np
.vstack
((X_ne
,X_train
[i
]))
plt
.plot
(X_po
[1:,0],X_po
[1:,1],"g*",label
="1")
plt
.plot
(X_ne
[1:, 0], X_ne
[1:, 1], "rx", label
="-1")
plt
.plot
(X_new
[:, 0], X_new
[:, 1], "bo", label
="test_points")
for xy
in zip(X_new
[:, 0], X_new
[:, 1]):
plt
.annotate
("test{}".format(xy
),xy
)
plt
.axis
([0,10,0,10])
plt
.xlabel
("x1")
plt
.ylabel
("x2")
plt
.legend
()
plt
.show
()
def main():
x_train
= np
.array
([[5,4],
[9,6],
[4,7],
[2,3],
[8,1],
[7,2]])
y_train
= np
.array
([1,1,1,-1,-1,-1])
x_test
= np
.array
([[5,3]])
draw
(x_train
,y_train
,x_test
)
for k
in range(1,6,2):
clf
= KNN
(x_train
,y_train
,k
=k
)
y_predict
= clf
.predict
(x_test
)
print('k={},被分类为: {}'.format(k
,y_predict
))
if __name__
== '__main__':
main
()
k=1, classified as: 1
k=3, classified as: -1
k=5, classified as: -1
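The flip between k=1 and k=3 is easy to verify by hand. The distances from the test point (5, 3) to the training points are 1 to (5, 4), √5 ≈ 2.24 to (7, 2), 3 to (2, 3), √13 ≈ 3.61 to (8, 1), √17 ≈ 4.12 to (4, 7), and 5 to (9, 6). The single nearest neighbor is positive, so k=1 predicts 1; but two of the three (and three of the five) nearest neighbors are negative, so k=3 and k=5 predict -1.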
Ideas for optimizing the linear scan (implemented in the code below):
- Handling many query points: replace the serial for loop with multiple processes.
- Sorting all distances in a list wastes time and space: keep only the k best candidates in a heap (see the heapq sketch right after this list).
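Before the full class, note that Python's heapq.nsmallest performs the same bounded-heap selection in one call; a minimal equivalent sketch (the helper name k_nearest_labels is mine, not from the post):

import heapq
import numpy as np
from collections import Counter

def k_nearest_labels(x_new, X_train, y_train, k=3):
    # nsmallest keeps at most k items on an internal heap, which is the same
    # idea as the hand-rolled negative-distance max-heap in the class below.
    pairs = [(np.linalg.norm(x_new - X_train[i]), y_train[i])
             for i in range(X_train.shape[0])]
    nearest = heapq.nsmallest(k, pairs, key=lambda t: t[0])
    return Counter(label for _, label in nearest).most_common(1)[0][0]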
import numpy as np
from collections import Counter
from concurrent import futures
import matplotlib.pyplot as plt
import heapq
import time

class KNN:
    def __init__(self, X_train, y_train, k=3):
        self.k = k
        self.X_train = X_train
        self.y_train = y_train

    def predict_single(self, x_new):
        # Keep the k nearest points in a fixed-size heap. Distances are stored
        # negated, so Python's min-heap behaves as a max-heap on distance and
        # the root is always the worst of the current k candidates.
        dist_list = [(-np.linalg.norm(x_new - self.X_train[i], ord=2), self.y_train[i], i)
                     for i in range(self.k)]
        heapq.heapify(dist_list)
        for i in range(self.k, self.X_train.shape[0]):
            dist_i = (-np.linalg.norm(x_new - self.X_train[i], ord=2), self.y_train[i], i)
            if dist_i[0] > dist_list[0][0]:
                # Closer than the current worst candidate: replace it.
                heapq.heappushpop(dist_list, dist_i)
        y_list = [dist_list[i][1] for i in range(self.k)]
        y_count = Counter(y_list).most_common()
        return y_count[0][0]

    def predict_many(self, X_new):
        # Classify each query point in a separate process.
        with futures.ProcessPoolExecutor(max_workers=4) as executor:
            tasks = [executor.submit(self.predict_single, X_new[i])
                     for i in range(X_new.shape[0])]
            # Collect results in task order (the original used as_completed,
            # which yields results in completion order, not input order).
            res = [task.result() for task in tasks]
        return res

def draw(X_train, y_train, X_new):
    # Split the training points by label so they can be plotted with different markers.
    X_po = np.zeros(X_train.shape[1])
    X_ne = np.zeros(X_train.shape[1])
    for i in range(y_train.shape[0]):
        if y_train[i] == 1:
            X_po = np.vstack((X_po, X_train[i]))
        else:
            X_ne = np.vstack((X_ne, X_train[i]))
    plt.plot(X_po[1:, 0], X_po[1:, 1], "g*", label="1")
    plt.plot(X_ne[1:, 0], X_ne[1:, 1], "rx", label="-1")
    plt.plot(X_new[:, 0], X_new[:, 1], "bo", label="test_points")
    for xy in zip(X_new[:, 0], X_new[:, 1]):
        plt.annotate("test{}".format(xy), xy)
    plt.axis([0, 10, 0, 10])
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.legend()
    plt.show()

def main():
    start = time.time()
    X_train = np.array([[5, 4],
                        [9, 6],
                        [4, 7],
                        [2, 3],
                        [8, 1],
                        [7, 2]])
    y_train = np.array([1, 1, 1, -1, -1, -1])
    X_new = np.array([[5, 3], [9, 2]])
    draw(X_train, y_train, X_new)
    for k in range(1, 6, 2):
        clf = KNN(X_train, y_train, k=k)
        y_predict = clf.predict_many(X_new)
        print("k={}, classified as: {}".format(k, y_predict))
    end = time.time()
    print("elapsed: {}s".format(round(end - start, 2)))  # fixed: 2 belongs inside round()

if __name__ == "__main__":
    main()
A kd-tree version follows. The tree is built recursively by sorting on one coordinate axis per level and splitting at the median; the search below only descends from the root toward a leaf without backtracking, so it is not guaranteed to return the true nearest neighbor.

from collections import namedtuple
import numpy as np

class Node(namedtuple("Node", "location left_child right_child")):
    def __repr__(self):
        return str(tuple(self))

class KdTree():
    def __init__(self, k=1, p=2):  # fixed: the original defined __init, not __init__
        self.k = k
        self.p = p
        self.kdtree = None

    def _fit(self, X, depth=0):
        try:
            k = X.shape[1]
        except IndexError:
            return None
        # Cycle through the axes and split at the median point.
        axis = depth % k
        X = X[X[:, axis].argsort()]
        median = X.shape[0] // 2
        try:
            X[median]
        except IndexError:
            return None
        return Node(
            location=X[median],
            left_child=self._fit(X[:median], depth + 1),
            right_child=self._fit(X[median + 1:], depth + 1)
        )

    def _search(self, point, tree=None, depth=0, best=None):
        # Simplified search: descend toward the leaf region containing `point`.
        if tree is None:
            return best
        k = point.shape[1]
        if point[0][depth % k] < tree.location[depth % k]:
            next_branch = tree.left_child
        else:
            next_branch = tree.right_child
        if next_branch is not None:
            best = next_branch.location
        return self._search(point, tree=next_branch, depth=depth + 1, best=best)

    def fit(self, X):
        self.kdtree = self._fit(X)
        return self.kdtree

    def predict(self, X):
        res = self._search(X, self.kdtree)
        return res

def main():
    KNN = KdTree()
    X_train = np.array([[5, 4],
                        [9, 6],
                        [4, 7],
                        [2, 3],
                        [8, 1],
                        [7, 2]])
    KNN.fit(X_train)
    X_new = np.array([[5, 3]])
    res = KNN.predict(X_new)
    print(res)

if __name__ == "__main__":
    main()
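As written, running main() descends to [2 3] and stops there, even though the true nearest neighbor of (5, 3) is [5 4]. A full kd-tree search also checks the far branch whenever the splitting plane is closer than the current best; a minimal sketch reusing the Node class above (nn_search is my name, not from the post; note it takes a 1-D point):

import numpy as np

def nn_search(tree, target, depth=0, best=None):
    # Nearest-neighbor search with backtracking over the Node tree built above.
    if tree is None:
        return best
    if best is None or np.linalg.norm(target - tree.location) < np.linalg.norm(target - best):
        best = tree.location
    axis = depth % target.shape[0]
    diff = target[axis] - tree.location[axis]
    near, far = ((tree.left_child, tree.right_child) if diff < 0
                 else (tree.right_child, tree.left_child))
    best = nn_search(near, target, depth + 1, best)
    # Visit the far branch only if the splitting plane is closer than the current best.
    if abs(diff) < np.linalg.norm(target - best):
        best = nn_search(far, target, depth + 1, best)
    return best

With this version, nn_search(KdTree().fit(X_train), np.array([5, 3])) returns [5 4] as expected.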
2. sklearn implementation
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def main():
    x_train = np.array([[5, 4],
                        [9, 6],
                        [4, 7],
                        [2, 3],
                        [8, 1],
                        [7, 2]])
    y_train = np.array([1, 1, 1, -1, -1, -1])
    x_test = np.array([[5, 3]])
    for k in range(1, 6, 2):
        # weights='distance' makes closer neighbors count more in the vote.
        clf = KNeighborsClassifier(n_neighbors=k, weights='distance', n_jobs=-1)
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        print(clf.predict_proba(x_test))
        print('k={}, classified as: {}'.format(k, y_predict))

if __name__ == '__main__':
    main()
[[0. 1.]]
k=1, classified as: [1]
[[0.43837481 0.56162519]]
k=3, classified as: [1]
[[0.45986872 0.54013128]]
k=5, classified as: [1]
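With weights='distance', each neighbor votes with weight 1/d, which is why k=3 and k=5 now classify the point as 1 even though the unweighted majority vote in the hand-coded version gave -1: the nearest neighbor (5, 4) at distance 1 outweighs the farther negative points. The printed probabilities can be reproduced by hand; for k=3, P(y=1) = (1/1) / (1/1 + 1/√5 + 1/3) ≈ 0.5616 and P(y=-1) ≈ 0.4384, matching [[0.43837481 0.56162519]].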
Considering only the effect of the sample size N:
- linear scan: computes N distances, O(N);
- kd tree: binary-tree search, O(log N).
As the dimension d grows, kd-tree efficiency degrades; in fact, when d approaches N the two methods perform about the same.
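A minimal sketch for checking this empirically (the sizes N = 100000, d = 2 and the query count are assumptions, not from the post):

import time
import numpy as np
from sklearn.neighbors import KDTree

rng = np.random.default_rng(0)
X = rng.random((100000, 2))      # N training points in d = 2 dimensions
queries = rng.random((100, 2))

start = time.perf_counter()
for q in queries:                # linear scan: N distances per query, O(N)
    _ = np.argmin(np.linalg.norm(X - q, axis=1))
print('linear scan: {:.3f}s'.format(time.perf_counter() - start))

tree = KDTree(X)                 # build once; each query is then O(log N) on average
start = time.perf_counter()
_ = tree.query(queries, k=1)
print('kd tree: {:.3f}s'.format(time.perf_counter() - start))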