文章目录
1.朴素贝叶斯的基本方法2.朴素贝叶斯的参数估计1.极大似然估计2.朴素贝叶斯算法3.贝叶斯估计
3.后验概率最大化-期望风险最小化4.朴素贝叶斯代码实现
1.朴素贝叶斯的基本方法
2.朴素贝叶斯的参数估计
1.极大似然估计
2.朴素贝叶斯算法
3.贝叶斯估计
用极大似然估计可能出现所要估计的概率值为0的情况,这时会影响到后验概率的计算结果,使分类产生偏差。解决这一问题的方法就是采用贝叶斯估计,原理是在分子、分母中加上指定数值,使得不同项之间大小关系不变,但消除了概率值为0的可能性
3.后验概率最大化-期望风险最小化
4.朴素贝叶斯代码实现
"""
@author: liujie
@software: PyCharm
@file: natives.py
@time: 2020/10/21 17:18
"""
import time
import numpy
as np
from tqdm
import tqdm
def loaddata(filename):
    """
    Load a MNIST-style CSV dataset.

    Each line has the form "label,p0,p1,...": the first field is the digit
    label, the rest are pixel values, which are binarized (value > 128 -> 1,
    else 0) to match the Bernoulli feature model used by the classifier.

    :param filename: path to the CSV file
    :return: (dataArr, labelArr) - list of binarized feature rows, list of int labels
    """
    dataArr = []
    labelArr = []
    # 'with' guarantees the file is closed even if parsing raises
    # (the original opened the file and never closed it).
    with open(filename) as fr:
        for line in tqdm(fr.readlines()):
            curLine = line.strip().split(',')
            # binarize pixels: >128 -> 1, else 0
            dataArr.append([int(int(num) > 128) for num in curLine[1:]])
            labelArr.append(int(curLine[0]))
    return dataArr, labelArr
def NaivesBayes(Py
, Px_y
, x
):
"""
通过贝叶斯进行概率估计
:param Py: 先验概率分布
:param Px_y: 条件概率分布
:param x: 要估计的样本
:return: 返回所有label的估计概率
"""
featureNum
= 784
classNum
= 10
P
= [0] * classNum
for i
in range(classNum
):
sum = 0
for j
in range(featureNum
):
sum += Px_y
[i
][j
][x
[j
]]
P
[i
] = sum + Py
[i
]
return P
.index
(max(P
))
def model_test(Py, Px_y, testDataArr, testLabelArr):
    """
    Evaluate the classifier on a labelled test set.

    :param Py: log prior distribution
    :param Px_y: log conditional distribution
    :param testDataArr: test feature vectors
    :param testLabelArr: test labels
    :return: accuracy in [0, 1]
    """
    total = len(testDataArr)
    wrong = 0
    for idx in tqdm(range(total)):
        # count a miss whenever the predicted label disagrees with the truth
        if NaivesBayes(Py, Px_y, testDataArr[idx]) != testLabelArr[idx]:
            wrong += 1
    return 1 - wrong / total
def getAllProbability(trainDataArr, trainLabelArr):
    """
    Estimate the (log) prior and conditional distributions from the
    training set, using Laplace (Bayesian) smoothing so no estimated
    probability is ever zero.

    :param trainDataArr: training feature vectors (binarized 0/1)
    :param trainLabelArr: training labels in 0..9
    :return: (Py, Px_y) - log prior of shape (labelNum, 1) and log
             conditional probabilities of shape (labelNum, featureNum, 2)
    """
    # Generalized: infer the feature count from the data (784 for MNIST).
    featureNum = len(trainDataArr[0]) if trainDataArr else 784
    labelNum = 10

    labels = np.array(trainLabelArr)  # np.mat is deprecated; ndarray compares the same
    # Prior with Laplace smoothing: P(y=i) = (count_i + 1) / (N + labelNum).
    # BUG FIX: the original compared 'np.mat(trainLabelArr) == True', which
    # yields the same count for every class; it must compare against the
    # class index i so each class gets its own count.
    Py = np.zeros((labelNum, 1))
    for i in range(labelNum):
        Py[i] = (np.sum(labels == i) + 1) / (len(trainLabelArr) + 10)
    Py = np.log(Py)  # work in log space to avoid underflow downstream

    # Count feature-value occurrences per (label, feature) pair.
    Px_y = np.zeros((labelNum, featureNum, 2))
    for i in range(len(trainLabelArr)):
        label = trainLabelArr[i]
        x = trainDataArr[i]
        for j in range(featureNum):
            Px_y[label][j][int(x[j])] += 1

    # Convert counts to smoothed log conditional probabilities:
    # P(x_j=v | y) = (count_v + 1) / (count_0 + count_1 + 2).
    for label in range(labelNum):
        for j in range(featureNum):
            Px_y0 = Px_y[label][j][0]
            Px_y1 = Px_y[label][j][1]
            Px_y[label][j][0] = np.log((Px_y0 + 1) / (Px_y0 + Px_y1 + 2))
            Px_y[label][j][1] = np.log((Px_y1 + 1) / (Px_y0 + Px_y1 + 2))
    return Py, Px_y
if __name__ == '__main__':
    start = time.time()

    # Load the binarized MNIST training and test splits.
    print('start to read trainSet')
    trainDataArr, trainLabelArr = loaddata('data/mnist_train.csv')
    print('start to read testSet')
    testDataArr, testLabelArr = loaddata('data/mnist_test.csv')

    # Estimate prior/conditional distributions, then evaluate accuracy.
    print('start to train')
    Py, Px_y = getAllProbability(trainDataArr, trainLabelArr)
    print('start to test')
    accuracy = model_test(Py, Px_y, testDataArr, testLabelArr)
    print('accuracy = ', accuracy)

    end = time.time()
    print('time=', end - start)
start to read trainSet
100%|██████████
| 60000/60000 [00:17<00:00, 3379.31it
/s
]
start to read testSet
100%|██████████
| 10000/10000 [00:02<00:00, 3418.50it
/s
]
start to train
start to test
100%|██████████
| 10000/10000 [00:41<00:00, 242.28it
/s
]
accuracy
= 0.8435
time
= 102.47366952896118