Reference article: Improving Deep Neural Networks - Initialization, Regularization and Gradient Checking
So far I am still not familiar with the dataset-loading and decision-boundary-plotting code, and I have not looked closely at the code that converts between vectors and dictionaries (there are short sketches on both after init_utils.py and gc_utils.py below).
The code implements the following:
Parameter initialization: 1.1 initialize the parameters with zeros; 1.2 initialize the parameters with random values; 1.3 initialize the parameters with He initialization, which suppresses gradient anomalies (see the videos on vanishing and exploding gradients). Regularization: 2.1 regularize a binary-classification model with the L2 norm, to try to avoid overfitting; 2.2 slim the model by randomly dropping units (dropout), also to try to avoid overfitting. Gradient checking: run gradient checking on the model to detect whether the gradients used during gradient descent deviate too much from numerical estimates.
1. Initialization
initialize.py
"""
初始化参数:
1:使用0来初始化参数
2:使用随机数来初始化参数
3:使用抑梯度异常初始化参数(参见视频中的梯度消失和梯度爆炸)
"""
import numpy
as np
import matplotlib
.pyplot
as plt
import Deep_Learning
.test2_1
.init_utils
plt
.rcParams
['figure.figsize'] = (7.0, 4.0)
plt
.rcParams
['image.interpolation'] = 'nearest'
plt
.rcParams
['image.cmap'] = 'gray'
train_X
, train_Y
, test_X
, test_Y
= Deep_Learning
.test2_1
.init_utils
.load_dataset
(is_plot
=True)
plt
.show
()
def initialize_parameters_zeros(layers_dims
):
"""
将模型的参数全部设置为0
:param layers_dims: -列表,模型的层数和对应每一层的节点的数量
:return: parameters -包含了所有W和b的字典(l属于[0,L])
Wl -权重矩阵,维度为(layers_dims[l], layers_dims[0])
bl -偏置向量,维度为(layers_dims[l], 1)
"""
parameters
= {}
L
= len(layers_dims
)
for l
in range(1, L
):
parameters
["W" + str(l
)] = np
.zeros
((layers_dims
[l
], layers_dims
[l
- 1]))
parameters
["b" + str(l
)] = np
.zeros
((layers_dims
[l
], 1))
assert (parameters
["W" + str(l
)].shape
) == ((layers_dims
[l
], layers_dims
[l
- 1]))
assert (parameters
["b" + str(l
)].shape
) == ((layers_dims
[l
], 1))
return parameters
print("====================测试initialize_parameters_zeros====================")
parameters
= initialize_parameters_zeros
([3, 2, 1])
print("W1 = " + str(parameters
["W1"]))
print("b1 = " + str(parameters
["b1"]))
print("W2 = " + str(parameters
["W2"]))
print("b2 = " + str(parameters
["b2"]))
def initialize_parameters_random(layers_dims
):
np
.random
.seed
(3)
parameters
= {}
L
= len(layers_dims
)
for l
in range(1, L
):
parameters
['W' + str(l
)] = np
.random
.rand
(layers_dims
[l
], layers_dims
[l
- 1]) * 10
parameters
['b' + str(l
)] = np
.zeros
((layers_dims
[l
], 1))
assert (parameters
["W" + str(l
)].shape
) == ((layers_dims
[l
],layers_dims
[l
- 1]))
assert (parameters
["b" + str(l
)].shape
) == ((layers_dims
[l
],1))
return parameters
print("====================测试initialize_parameters_random====================")
parameters
= initialize_parameters_random
([3, 2, 1])
print("W1 = " + str(parameters
["W1"]))
print("b1 = " + str(parameters
["b1"]))
print("W2 = " + str(parameters
["W2"]))
print("b2 = " + str(parameters
["b2"]))
def initialize_parameters_he(layers_dims
):
np
.random
.seed
(3)
parameters
= {}
L
= len(layers_dims
)
for l
in range(1, L
):
parameters
["W" + str(l
)] = np
.random
.randn
(layers_dims
[l
], layers_dims
[l
- 1]) * np
.sqrt
(2 / layers_dims
[l
- 1])
parameters
["b" + str(l
)] = np
.zeros
((layers_dims
[l
], 1))
assert (parameters
["W" + str(l
)].shape
) == ((layers_dims
[l
],layers_dims
[l
- 1]))
assert (parameters
["b" + str(l
)].shape
) == ((layers_dims
[l
],1))
return parameters
print("====================测试initialize_parameters_he====================")
parameters
= initialize_parameters_random
([2, 4, 1])
print("W1 = " + str(parameters
["W1"]))
print("b1 = " + str(parameters
["b1"]))
print("W2 = " + str(parameters
["W2"]))
print("b2 = " + str(parameters
["b2"]))
def model01(X
, Y
, learning_rate
=0.01, num_interations
=15000, print_cost
=True, initialization
="he", is_Plot
=True):
"""
实现一个三层的神经网络:LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
:param X: -输入数据,维度为(2,要训练/测试的数量)
:param Y: -标签,【0 | 1】,维度为(1,对应的是输入数据的标签)
:param learning_rate: -学习速率
:param num_interations: -迭代次数
:param print_cost: -是否打印成本值,每1000次打印一次
:param initialization: -字符串类型,初始化的类型【"zeros" | "random | "he"】
:param is_Plot: -是否绘制梯度下降的曲线图
:return: parameters -学习后的参数
"""
grads
= {}
costs
= []
m
= X
.shape
[1]
layers_dims
= [X
.shape
[0], 10, 5, 1]
if initialization
== "zeros":
parameters
= initialize_parameters_zeros
(layers_dims
)
elif initialization
== "random":
parameters
= initialize_parameters_random
(layers_dims
)
elif initialization
== "he":
parameters
= initialize_parameters_he
(layers_dims
)
else:
print("错误的初始化参数!程序退出")
exit
for i
in range(0, num_interations
):
a3
, cache
= Deep_Learning
.test2_1
.init_utils
.forward_propagation
(X
, parameters
)
cost
= Deep_Learning
.test2_1
.init_utils
.compute_loss
(a3
, Y
)
grads
= Deep_Learning
.test2_1
.init_utils
.backward_propagation
(X
, Y
, cache
)
parameters
= Deep_Learning
.test2_1
.init_utils
.update_parameters
(parameters
, grads
, learning_rate
)
if i
% 1000 == 0:
costs
.append
(cost
)
if print_cost
:
print("第" + str(i
) + "次迭代,成本值为:", np
.squeeze
(cost
))
if is_Plot
:
plt
.plot
(costs
)
plt
.ylabel
('cost')
plt
.xlabel
('iterations (per hundreds)')
plt
.title
("Learning rate = " + str(learning_rate
))
plt
.show
()
return parameters
parameters
= model01
(train_X
, train_Y
, initialization
="zeros")
print("训练集:")
predictions_train
= Deep_Learning
.test2_1
.init_utils
.predict
(train_X
, train_Y
, parameters
)
print("测试集:")
predictions_test
= Deep_Learning
.test2_1
.init_utils
.predict
(test_X
, test_Y
, parameters
)
print("predictions_train = " + str(predictions_train
))
print("predictions_test = " + str(predictions_test
))
plt
.title
("Model01 with Zeros initialization")
axes
= plt
.gca
()
axes
.set_xlim
([-1.5, 1.5])
axes
.set_ylim
([-1.5, 1.5])
Deep_Learning
.test2_1
.init_utils
.plot_decision_boundary
(lambda x
: Deep_Learning
.test2_1
.init_utils
.predict_dec
(parameters
, x
.T
), train_X
, train_Y
)
"""
1.零初始化结果显示为一条直线,学习率没有变化,模型没有学习,分类失败,零初始化导致神经网络无法打破对称性
2.随机初始化有助于打破对称,不同隐藏层单元可以学习到不同的参数,但是误差开始很高,这是由于具有较大的随机权重,若初始化参数没有很好的话会导致梯度消失、爆炸,也会减慢优化算法。
3.抑梯度异常初始化的结果很好
"""
Viewing the dataset (scatter plot of the training data)
1.1 Results with zero initialization
==================== testing initialize_parameters_zeros ====================
W1 = [[0. 0. 0.]
 [0. 0. 0.]]
b1 = [[0.]
 [0.]]
W2 = [[0. 0.]]
b2 = [[0.]]
Iteration 0, cost: 0.6931471805599453
Iteration 1000, cost: 0.6931471805599453
Iteration 2000, cost: 0.6931471805599453
Iteration 3000, cost: 0.6931471805599453
Iteration 4000, cost: 0.6931471805599453
Iteration 5000, cost: 0.6931471805599453
Iteration 6000, cost: 0.6931471805599453
Iteration 7000, cost: 0.6931471805599453
Iteration 8000, cost: 0.6931471805599453
Iteration 9000, cost: 0.6931471805599453
Iteration 10000, cost: 0.6931471805599455
Iteration 11000, cost: 0.6931471805599453
Iteration 12000, cost: 0.6931471805599453
Iteration 13000, cost: 0.6931471805599453
Iteration 14000, cost: 0.6931471805599453
Training set:
Accuracy: 0.5
Test set:
Accuracy: 0.5
predictions_train = [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0]]
predictions_test = [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
With zero initialization the decision boundary is a straight line: the cost never changes, the model learns nothing, and classification fails, because zero initialization keeps the network from breaking symmetry.
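A minimal standalone sketch (not part of the assignment code) of why zeros cannot break symmetry: every hidden unit in a layer sees the same inputs, the same (zero) weights and the same upstream signal, so the gradient rows it receives are identical and the units can never become different from one another.

import numpy as np

np.random.seed(0)
X = np.random.randn(3, 5)                        # 3 features, 5 examples
Y = (np.random.rand(1, 5) > 0.5).astype(float)

W1, b1 = np.zeros((2, 3)), np.zeros((2, 1))      # two hidden units with identical (zero) rows
W2, b2 = np.zeros((1, 2)), np.zeros((1, 1))

Z1 = np.dot(W1, X) + b1
A1 = np.maximum(0, Z1)                           # both rows of A1 are identical
A2 = 1 / (1 + np.exp(-(np.dot(W2, A1) + b2)))    # sigmoid output

dZ2 = A2 - Y
dA1 = np.dot(W2.T, dZ2)                          # both rows of dA1 are identical
dZ1 = dA1 * (Z1 > 0)
dW1 = np.dot(dZ1, X.T) / X.shape[1]

print(np.allclose(dW1[0], dW1[1]))               # True: after any update the two rows are still identical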
1.2 Results with random initialization
==================== testing initialize_parameters_random ====================
W1 = [[5.50797903 7.08147823 2.90904739]
 [5.10827605 8.92946954 8.96293089]]
b1 = [[0.]
 [0.]]
W2 = [[1.2558531 2.07242878]]
b2 = [[0.]]
Iteration 0, cost: inf
Iteration 1000, cost: 0.7621259370184228
Iteration 2000, cost: 0.6690029827916156
Iteration 3000, cost: 0.6513359019864347
Iteration 4000, cost: 0.6470045598995088
Iteration 5000, cost: 0.6447240738113086
Iteration 6000, cost: 0.6409840974187322
Iteration 7000, cost: 0.6278546078886772
Iteration 8000, cost: 0.5964022514198947
Iteration 9000, cost: 0.5493267350966603
Iteration 10000, cost: 0.5069866933465077
Iteration 11000, cost: 0.4899755730226258
Iteration 12000, cost: 0.47892254564188913
Iteration 13000, cost: 0.47441470732170654
Iteration 14000, cost: 0.47138671562861467
Training set:
Accuracy: 0.6833333333333333
Test set:
Accuracy: 0.67
predictions_train = [[1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1
1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0
1 1 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1
1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1
1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1
1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1
1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1
1 1 1 1 1 0 1 1 1 1 1 0]]
predictions_test = [[1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1
0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1]]
Random initialization helps break symmetry, so different hidden units can learn different parameters, but the cost starts very high because the random weights are large; badly chosen initial parameters can lead to vanishing or exploding gradients and also slow down optimization.
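Since initialize_parameters_random above uses np.random.rand, all of its weights are positive: rand samples uniformly from [0, 1), whereas np.random.randn (used by the He initializer) samples from a standard normal distribution and gives both signs. A quick standalone check:

import numpy as np

np.random.seed(3)
print(np.random.rand(2, 3))     # uniform on [0, 1); times 10 these are exactly the W1 values printed above
np.random.seed(3)
print(np.random.randn(2, 3))    # standard normal: mixed signs, not bounded by 1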
1.3 Results with He initialization
==================== testing initialize_parameters_he ====================
W1 = [[5.50797903 7.08147823]
 [2.90904739 5.10827605]
 [8.92946954 8.96293089]
 [1.2558531 2.07242878]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[0.51467203 4.40809844 0.29876211 4.56833224]]
b2 = [[0.]]
(These W values come from initialize_parameters_random, because the test above calls that function by mistake; the training run below does use He initialization.)
Iteration 0, cost: 0.8830537463419761
Iteration 1000, cost: 0.6879825919728063
Iteration 2000, cost: 0.6751286264523371
Iteration 3000, cost: 0.6526117768893807
Iteration 4000, cost: 0.6082958970572938
Iteration 5000, cost: 0.5304944491717495
Iteration 6000, cost: 0.4138645817071794
Iteration 7000, cost: 0.3117803464844441
Iteration 8000, cost: 0.23696215330322562
Iteration 9000, cost: 0.18597287209206836
Iteration 10000, cost: 0.15015556280371817
Iteration 11000, cost: 0.12325079292273552
Iteration 12000, cost: 0.09917746546525932
Iteration 13000, cost: 0.08457055954024274
Iteration 14000, cost: 0.07357895962677362
Training set:
Accuracy: 0.9933333333333333
Test set:
Accuracy: 0.96
predictions_train = [[1 0 1 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 1 0 1 1 0 1 1 0 0 0
0 1 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 0
0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 1 0
0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0
0 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 1 0
1 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1
0 1 0 0 0 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 1 0
1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 1
1 1 1 0 0 0 0 1 1 0 1 0]]
predictions_test = [[1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1
0 1 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0
1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0]]
He initialization (the "gradient-taming" initialization) gives very good results.
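For reference, initialize_parameters_he draws each weight from a zero-mean normal distribution scaled by the fan-in of its layer and leaves the biases at zero (here n^{[l-1]} is the number of units in layer l-1):

W^{[l]} \sim \sqrt{\tfrac{2}{n^{[l-1]}}} \,\mathcal{N}(0, 1), \qquad b^{[l]} = 0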
2. Regularization
regularization.py
"""
正则化模型:
1:使用二范数对二分类模型正则化,尝试避免过拟合。
2:使用随机删除节点的方法精简模型,同样是为了尝试避免过拟合。
我们要做的就是使用模型画一条线将红色和蓝色分界,分别使用以下三种
1.不使用正则化
2.使用正则化
2.1使用L2正则化:将lambd输入设置为非零值,不使用lambda是因为其是python的关键保留字
2.2使用随机节点删除:将keep_prob设置为小于1的值
"""
import numpy
as np
import matplotlib
.pyplot
as plt
import Deep_Learning
.test2_1
.reg_utils
plt
.rcParams
['figure.figsize'] = (7.0, 4.0)
plt
.rcParams
['image.interpolation'] = 'nearest'
plt
.rcParams
['image.cmap'] = 'gray'
train_X
, train_Y
, test_X
, test_Y
= Deep_Learning
.test2_1
.reg_utils
.load_2D_dataset
(is_plot
=True)
plt
.show
()
"""
我们要做的就是使用模型画一条线将红色和蓝色分界,分别使用以下三种
1.不使用正则化
2.使用正则化
2.1使用L2正则化:将lambd输入设置为非零值,不使用lambda是因为其是python的关键保留字
2.2使用随机节点删除:将keep_prob设置为小于1的值
"""
def model02(X
,Y
,learning_rate
=0.3,num_iterations
=30000,print_cost
=True,is_Plot
=True,lambd
=0,keep_prob
=1):
"""
实现一个三层的神经网络:LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
:param X: -输入数据,维度为(2,要训练/测试的数量)
:param Y: -标签,【0(蓝色) | 1(红色)】,维度为(1,对应的是输入数据的标签)
:param learning_rate: -学习速率
:param num_iterations: -迭代次数
:param print_cost: -是否打印成本值,每迭代10000次打印一次,但是每1000次记录一个成本值
:param is_Plot: -是否绘制梯度下降的曲线图
:param lambd: -正则化的超参数、实数
:param keep_prob: -随机删除节点的概率
:return: parameters -学习后的参数
"""
grads
= {}
costs
= []
m
= X
.shape
[1]
layers_dims
= [X
.shape
[0],20,3,1]
parameters
= Deep_Learning
.test2_1
.reg_utils
.initialize_parameters
(layers_dims
)
for i
in range(0,num_iterations
):
if keep_prob
== 1:
a3
,cache
= Deep_Learning
.test2_1
.reg_utils
.forward_propagation
(X
,parameters
)
elif keep_prob
< 1:
a3
,cache
= forward_propagation_with_dropout
(X
,parameters
,keep_prob
)
else:
print("keep_prob参数错误,程序退出")
if lambd
== 0:
cost
= Deep_Learning
.test2_1
.reg_utils
.compute_cost
(a3
,Y
)
else:
cost
= compute_cost_with_regularization
(a3
,Y
,parameters
,lambd
)
assert (lambd
== 0 or keep_prob
== 1)
if (lambd
== 0 and keep_prob
== 1):
grads
= Deep_Learning
.test2_1
.reg_utils
.backward_propagation
(X
,Y
,cache
)
elif lambd
!= 0:
grads
= backward_propagation_with_regularization
(X
,Y
,cache
,lambd
)
elif keep_prob
< 1:
grads
= backward_propagation_with_dropout
(X
,Y
,cache
,keep_prob
)
parameters
= Deep_Learning
.test2_1
.reg_utils
.update_parameters
(parameters
,grads
,learning_rate
)
if i
% 1000 == 0:
costs
.append
(cost
)
if (print_cost
and i
% 10000 == 0):
print("第" + str(i
) + "次迭代,成本值为:" + str(cost
))
if is_Plot
:
plt
.plot
(costs
)
plt
.ylabel
('cost')
plt
.xlabel
('iterations(x1, 000)')
plt
.title
("Learning rate = " + str(learning_rate
))
plt
.show
()
return parameters
parameters
= model02
(train_X
,train_Y
)
print("训练集:")
predictions_train
= Deep_Learning
.test2_1
.reg_utils
.predict
(train_X
,train_Y
,parameters
)
print("测试集:")
predictions_test
= Deep_Learning
.test2_1
.reg_utils
.predict
(test_X
,test_Y
,parameters
)
plt
.title
("Model without regularization")
axes
= plt
.gca
()
axes
.set_xlim
([-0.75,0.40])
axes
.set_ylim
([-0.75,0.65])
Deep_Learning
.test2_1
.reg_utils
.plot_decision_boundary
(lambda x
: Deep_Learning
.test2_1
.reg_utils
.predict_dec
(parameters
,x
.T
),train_X
,train_Y
)
def compute_cost_with_regularization(A3
,Y
,parameters
,lambd
):
"""
实现L2正则化计算成本
:param A3: -正向传播的输出结果,维度为(输出节点数量,训练/测试的数量)
:param Y: -标签向量,与数据一一对应,维度为(输出节点数量,训练/测试的数量)
:param parameters: -包含模型学习后的参数的字典
:param lambd: -正则化的超参数、实数
:return: cost -正则化损失的值
"""
m
= Y
.shape
[1]
W1
= parameters
["W1"]
W2
= parameters
["W2"]
W3
= parameters
["W3"]
cross_entropy_cost
= Deep_Learning
.test2_1
.reg_utils
.compute_cost
(A3
,Y
)
L2_regularization_cost
= lambd
* (np
.sum(np
.square
(W1
)) + np
.sum(np
.square
(W2
)) + np
.sum(np
.square
(W3
))) / (2 * m
)
cost
= cross_entropy_cost
+ L2_regularization_cost
return cost
def backward_propagation_with_regularization(X
,Y
,cache
,lambd
):
"""
实现添加了L2正则化的模型的后向传播
:param X: -输入数据集,维度为(输入节点数量,数据集里面的数量)
:param Y: -标签,维度为(输出节点数量,数据集里面的数量)
:param cache: -来自forward_propagation()的cache输出
:param lambd: -regularization超参数,实数
:return: gradients -包含每个参数、激活值和预激活值变量的梯度的字典
"""
m
= X
.shape
[1]
(Z1
,A1
,W1
,b1
,Z2
,A2
,W2
,b2
,Z3
,A3
,W3
,b3
) = cache
dZ3
= A3
- Y
dW3
= (1 / m
) * np
.dot
(dZ3
,A2
.T
) + ((lambd
* W3
) / m
)
db3
= (1 / m
) * np
.sum(dZ3
,axis
=1,keepdims
=True)
dA2
= np
.dot
(W3
.T
,dZ3
)
dZ2
= np
.multiply
(dA2
,np
.int64
(A2
> 0))
dW2
= (1 / m
) * np
.dot
(dZ2
,A1
.T
) + ((lambd
* W2
) / m
)
db2
= (1 / m
) * np
.sum(dZ2
,axis
=1,keepdims
=True)
dA1
= np
.dot
(W2
.T
,dZ2
)
dZ1
= np
.multiply
(dA1
,np
.int64
(A1
> 0))
dW1
= (1 / m
) * np
.dot
(dZ1
,X
.T
) + ((lambd
* W1
) / m
)
db1
= (1 / m
) * np
.sum(dZ1
,axis
=1,keepdims
=True)
gradients
= {"dZ3": dZ3
,"dW3": dW3
,"db3": db3
,"dA2": dA2
,
"dZ2": dZ2
,"dW2": dW2
,"db2": db2
,"dA1": dA1
,
"dZ1": dZ1
,"dW1": dW1
,"db1": db1
}
return gradients
parameters
= model02
(train_X
,train_Y
,lambd
=0.7)
print("使用正则化,训练集:")
predictions_train
= Deep_Learning
.test2_1
.reg_utils
.predict
(train_X
,train_Y
,parameters
)
print("使用正则化,测试集:")
predictions_test
= Deep_Learning
.test2_1
.reg_utils
.predict
(test_X
,test_Y
,parameters
)
plt
.title
("Model with L2-regularization")
axes
= plt
.gca
()
axes
.set_xlim
([-0.75,0.40])
axes
.set_ylim
([-0.75,0.65])
Deep_Learning
.test2_1
.reg_utils
.plot_decision_boundary
(
lambda x
: Deep_Learning
.test2_1
.reg_utils
.predict_dec
(parameters
,x
.T
),train_X
,train_Y
)
"""
lambd的值是可以使用开发集调整时的超参数,L2正则化会使决策边界更加平滑,若lambd太大,会平滑过渡模型产生高偏差
L2正则化依赖于较小权重的模型比具有较大权重的模型更简单这样的假设,因此通过削弱成本函数中权重的平方值,可以将所有权重值逐渐变小
权重高的话会有更平滑的模型,其中输入变化时输出变化更慢,L2正则化对以下内容有影响
1.成本计算:正则化的计算需要添加到成本函数中
2.反向传播:在权重矩阵中,梯度计算时也要依据正则化作出相应的计算
3.重量变小(重量衰减):权重被逐渐改变到较小的值
"""
def forward_propagation_with_dropout(X
,parameters
,keep_prob
=0.5):
"""
实现具有随机舍弃节点的前向传播
LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID
:param X: -输入数据集,维度为(2,示例数)
:param parameters: -包含参数"W1","b1","W2","b2","W3","b3"的字典
:param keep_prob: -随机删除的概率,实数
:return: A3 -最后的激活值,维度为(1,1),正向传播的输出
cache -存储一些用于计算反向传播的数值的元组
"""
np
.random
.seed
(1)
W1
= parameters
["W1"]
b1
= parameters
["b1"]
W2
= parameters
["W2"]
b2
= parameters
["b2"]
W3
= parameters
["W3"]
b3
= parameters
["b3"]
Z1
= np
.dot
(W1
,X
) + b1
A1
= Deep_Learning
.test2_1
.reg_utils
.relu
(Z1
)
D1
= np
.random
.rand
(A1
.shape
[0],A1
.shape
[1])
D1
= D1
< keep_prob
A1
= A1
* D1
A1
= A1
/ keep_prob
"""
示例代码帮助理解:
import numpy as np
np.random.seed(1)
# ??????????rand和randn区别
A1 = np.random.randn(1, 3)
print("A1:", A1) # A1: [[ 1.62434536 -0.61175641 -0.52817175]]
D1 = np.random.rand(A1.shape[0], A1.shape[1])
print("D1:", D1) # D1: [[0.39676747 0.53881673 0.41919451]]
keep_prob = 0.5
D1 = D1 < keep_prob
print(D1) # [[ True False True]]
A1 = 0.01
A1 = A1 * D1
A1 = A1 / keep_prob
print(A1) # [[0.02 0. 0.02]]
"""
Z2
= np
.dot
(W2
,A1
) + b2
A2
= Deep_Learning
.test2_1
.reg_utils
.relu
(Z2
)
D2
= np
.random
.rand
(A2
.shape
[0],A2
.shape
[1])
D2
= D2
< keep_prob
A2
= A2
* D2
A2
= A2
/ keep_prob
Z3
= np
.dot
(W3
,A2
) + b3
A3
= Deep_Learning
.test2_1
.reg_utils
.sigmoid
(Z3
)
cache
= (Z1
,D1
,A1
,W1
,b1
,Z2
,D2
,A2
,W2
,b2
,Z3
,A3
,W3
,b3
)
return A3
,cache
def backward_propagation_with_dropout(X
,Y
,cache
,keep_prob
):
"""
:param X: -输入数据集,维度为(2,示例数)
:param Y: -标签,维度为(输出节点数量,示例数量)
:param cache: -来自forward_propagation_with_dropout()的cache输出
:param keep_prob: -随机删除的概率,实数
:return:gradients -一个关于每个参数,激活值和预激活变量的梯度值的字典
"""
m
= X
.shape
[1]
(Z1
,D1
,A1
,W1
,b1
,Z2
,D2
,A2
,W2
,b2
,Z3
,A3
,W3
,b3
) = cache
dZ3
= A3
- Y
dW3
= (1 / m
) * np
.dot
(dZ3
,A2
.T
)
db3
= (1. / m
) * np
.sum(dZ3
,axis
=1,keepdims
=True)
dA2
= np
.dot
(W3
.T
,dZ3
)
dA2
= dA2
* D2
dA2
= dA2
/ keep_prob
dZ2
= np
.multiply
(dA2
,np
.int64
(A2
> 0))
dW2
= 1. / m
* np
.dot
(dZ2
,A1
.T
)
db2
= 1. / m
* np
.sum(dZ2
,axis
=1,keepdims
=True)
dA1
= np
.dot
(W2
.T
,dZ2
)
dA1
= dA1
* D1
dA1
= dA1
/ keep_prob
dZ1
= np
.multiply
(dA1
,np
.int64
(A1
> 0))
dW1
= 1. / m
* np
.dot
(dZ1
,X
.T
)
db1
= 1. / m
* np
.sum(dZ1
,axis
=1,keepdims
=True)
gradients
= {"dZ3": dZ3
,"dW3": dW3
,"db3": db3
,"dA2": dA2
,
"dZ2": dZ2
,"dW2": dW2
,"db2": db2
,"dA1": dA1
,
"dZ1": dZ1
,"dW1": dW1
,"db1": db1
}
return gradients
parameters
= model02
(train_X
,train_Y
,keep_prob
=0.86,learning_rate
=0.3)
print("使用随机删除节点,训练集:")
predictions_train
= Deep_Learning
.test2_1
.reg_utils
.predict
(train_X
,train_Y
,parameters
)
print("使用随机删除节点,测试集:")
predictions_test
= Deep_Learning
.test2_1
.reg_utils
.predict
(test_X
,test_Y
,parameters
)
plt
.title
("Model with dropout")
axes
= plt
.gca
()
axes
.set_xlim
([-0.75,0.40])
axes
.set_ylim
([-0.75,0.65])
Deep_Learning
.test2_1
.reg_utils
.plot_decision_boundary
(
lambda x
: Deep_Learning
.test2_1
.reg_utils
.predict_dec
(parameters
,x
.T
),train_X
,train_Y
)
"""
正则化会把训练集的准确度降低,但是测试集的准确度提高了
"""
Viewing the dataset (scatter plot of the training data)
2.1 Results without regularization
Iteration 0, cost: 0.6557412523481002
Iteration 10000, cost: 0.16329987525724216
Iteration 20000, cost: 0.1385164242327309
Training set:
Accuracy: 0.9478672985781991
Test set:
Accuracy: 0.915
Without regularization, the decision boundary clearly overfits the training data.
2.2 Results with L2 regularization
Iteration 0, cost: 0.6974484493131264
Iteration 10000, cost: 0.2684918873282239
Iteration 20000, cost: 0.2680916337127301
With L2 regularization, training set:
Accuracy: 0.9383886255924171
With L2 regularization, test set:
Accuracy: 0.93
lambd is a hyperparameter that can be tuned on a dev set. L2 regularization makes the decision boundary smoother, but if lambd is too large it over-smooths and the model ends up with high bias. L2 regularization relies on the assumption that a model with small weights is simpler than one with large weights, so by penalizing the squared weights in the cost function all weights are gradually pushed to smaller values; large weights become too costly, which yields a smoother model whose output changes more slowly as the input changes. L2 regularization affects three things:
the cost computation (the regularization term is added to the cost function); backpropagation (the gradients of the weight matrices gain an extra term from the regularization); and the weights themselves, which shrink ("weight decay") towards smaller values.
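In formula form, what compute_cost_with_regularization computes for this three-layer model is

J_{regularized} = -\frac{1}{m}\sum_{i=1}^{m}\left( y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\!\left(1-a^{[3](i)}\right) \right) + \frac{\lambda}{2m}\sum_{l=1}^{3}\left\lVert W^{[l]} \right\rVert_F^2

and correspondingly each dW^{[l]} in backward_propagation_with_regularization gains the extra term \frac{\lambda}{m} W^{[l]}.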
2.3 Results with dropout
Iteration 0, cost: 0.6543912405149825
Iteration 10000, cost: 0.0610169865749056
Iteration 20000, cost: 0.060582435798513114
With dropout, training set:
Accuracy: 0.9289099526066351
With dropout, test set:
Accuracy: 0.95
Regularization lowers the training-set accuracy a little, but the test-set accuracy improves.
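Why forward_propagation_with_dropout divides the kept activations by keep_prob (inverted dropout): the division keeps the expected value of every activation unchanged, so later layers see the same scale during training as at test time, when no units are dropped. A standalone numerical check:

import numpy as np

np.random.seed(1)
keep_prob = 0.86
A = np.random.rand(20, 1000) + 0.5                        # some positive activations
D = np.random.rand(A.shape[0], A.shape[1]) < keep_prob    # keep mask, roughly 86% True
A_drop = (A * D) / keep_prob                              # inverted dropout scaling
print(A.mean(), A_drop.mean())                            # the two means come out very close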
3. Gradient checking
gradient_check.py
"""
梯度校验:对模型使用梯度校验,检测它是否在梯度下降的过程中出现误差过大的情况。
"""
import numpy
as np
import Deep_Learning
.test2_1
.gc_utils
def forward_propagation(x
, theta
):
"""
实现线性前向传播(计算J)(J(theta) = theta * x)
:param x: -一个实值输入
:param theta: -参数,也是一个实数
:return: J -函数J的值
"""
J
= np
.dot
(theta
, x
)
return J
def backward_propagation(x
, theta
):
"""
计算J相对于theta的导数
:param x: -一个实值输入
:param theta: -参数,也是一个实数
:return: dtheta -相对于theta的成本梯度
"""
dtheta
= x
return dtheta
def gradient_check(x
, theta
, epsilon
=1e-7):
"""
:param x: -一个实值输入
:param theta: -参数,也是一个实数
:param epsilon: -计算输入的微笑偏移以计算近似梯度
:return: 近似梯度和后向传播梯度之间的差异
"""
thetaplus
= theta
+ epsilon
thetaminus
= theta
- epsilon
J_plus
= forward_propagation
(x
, thetaplus
)
J_minus
= forward_propagation
(x
, thetaminus
)
gradapprox
= (J_plus
- J_minus
) / (2 * epsilon
)
grad
= backward_propagation
(x
, theta
)
numerator
= np
.linalg
.norm
(grad
- gradapprox
)
denominator
= np
.linalg
.norm
(grad
) + np
.linalg
.norm
(gradapprox
)
difference
= numerator
/ denominator
if difference
< 1e-7:
print("一维线性梯度检查:梯度正常")
else:
print("一维线性梯度检查:梯度超出阈值")
return difference
print("=================测试gradient_check====================")
x
, theta
= 2, 4
difference
= gradient_check
(x
, theta
)
print("difference = " + str(difference
))
"""
运行结果:
一维线性梯度检查:梯度正常
difference = 2.919335883291695e-10
"""
def forward_propagation_n(X, Y, parameters):
    """
    :param X: training set of m examples
    :param Y: labels for the m examples
    :param parameters: dictionary containing "W1", "b1", "W2", "b2", "W3", "b3" (layers_dims = [4, 5, 3, 1])
    :return: cost - the cost function
    """
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    Z1 = np.dot(W1, X) + b1
    A1 = Deep_Learning.test2_1.gc_utils.relu(Z1)

    Z2 = np.dot(W2, A1) + b2
    A2 = Deep_Learning.test2_1.gc_utils.relu(Z2)

    Z3 = np.dot(W3, A2) + b3
    # Note: the output layer uses relu here rather than sigmoid, so A3 is not confined to (0, 1);
    # this can make np.log(A3) or np.log(1 - A3) blow up (see the remark after the run below).
    A3 = Deep_Learning.test2_1.gc_utils.relu(Z3)

    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = (1 / m) * np.sum(logprobs)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    return cost, cache


def backward_propagation_n(X, Y, cache):
    """
    :param X: input datapoint, of shape (input size, 1)
    :param Y: label
    :param cache: cache output from forward_propagation_n()
    :return: gradients - dictionary with the gradients of the cost with respect to each parameter,
             activation and pre-activation variable
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = (1. / m) * np.dot(dZ3, A2.T)
    db3 = (1. / m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}
    return gradients


def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Check whether backward_propagation_n correctly computes the gradient of the cost produced by forward_propagation_n.
    :param parameters: dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"
    :param gradients: output of backward_propagation_n, the gradients of the cost with respect to the parameters
    :param X: input datapoint, of shape (input size, 1)
    :param Y: label
    :param epsilon: tiny shift of the input, used to compute the approximate gradient
    :return: difference - the difference between the approximate gradient and the backward-propagation gradient
    """
    # flatten the parameter dictionary and the gradient dictionary into column vectors
    parameters_values, keys = Deep_Learning.test2_1.gc_utils.dictionary_to_vector(parameters)
    grad = Deep_Learning.test2_1.gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # perturb one parameter at a time and compute the centered-difference approximation
    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], cache = forward_propagation_n(X, Y, Deep_Learning.test2_1.gc_utils.vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], cache = forward_propagation_n(X, Y, Deep_Learning.test2_1.gc_utils.vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference < 1e-7:
        print("N-dimensional gradient check: the gradient is correct")
    else:
        print("N-dimensional gradient check: the gradient exceeds the threshold")

    return difference


print("======================= testing gradient_check_n ====================")
np.random.seed(2)
X = np.random.randn(4, 3)
Y = np.array([1, 1, 0])
W1 = np.random.randn(5, 4)
b1 = np.zeros((5, 1))
W2 = np.random.randn(3, 5)
b2 = np.zeros((3, 1))
W3 = np.random.randn(1, 3)
b3 = np.zeros((1, 1))
parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3}

cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)
print("difference = " + str(difference))
"""
Output (no concrete numeric result here, since the check reports "exceeds the threshold";
the outcome also depends on the random seed used to initialize X):
N-dimensional gradient check: the gradient exceeds the threshold
difference = nan
"""
Toolkit code
init_utils.py
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets


def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def compute_loss(a3, Y):
    """
    Implement the loss function
    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3
    Returns:
    loss - value of the loss function
    """
    m = Y.shape[1]
    np.seterr(divide='ignore', invalid='ignore')
    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    loss = 1. / m * np.nansum(logprobs)
    return loss


def forward_propagation(X, parameters):
    """
    Implements the forward propagation (and computes the loss) presented in Figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                  W1 -- weight matrix of shape ()
                  b1 -- bias vector of shape ()
                  W2 -- weight matrix of shape ()
                  b2 -- bias vector of shape ()
                  W3 -- weight matrix of shape ()
                  b3 -- bias vector of shape ()
    Returns:
    loss -- the loss function (vanilla logistic loss)
    """
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
    return a3, cache


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation presented in figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    cache -- cache output from forward_propagation()
    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent
    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of n_model_backward
    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters['W' + str(i)] = ...
                  parameters['b' + str(i)] = ...
    """
    L = len(parameters) // 2  # number of layers in the network

    for k in range(L):
        parameters["W" + str(k + 1)] = parameters["W" + str(k + 1)] - learning_rate * grads["dW" + str(k + 1)]
        parameters["b" + str(k + 1)] = parameters["b" + str(k + 1)] - learning_rate * grads["db" + str(k + 1)]

    return parameters


def predict(X, y, parameters):
    """
    This function is used to predict the results of a n-layer neural network.
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    Returns:
    p -- predictions for the given dataset X
    """
    m = X.shape[1]
    p = np.zeros((1, m), dtype=int)   # plain int instead of the deprecated np.int

    # forward propagation, then threshold the output probabilities at 0.5
    a3, caches = forward_propagation(X, parameters)

    for i in range(0, a3.shape[1]):
        if a3[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p


def load_dataset(is_plot=True):
    np.random.seed(1)
    train_X, train_Y = sklearn.datasets.make_circles(n_samples=300, noise=.05)
    np.random.seed(2)
    test_X, test_Y = sklearn.datasets.make_circles(n_samples=100, noise=.05)
    if is_plot:
        plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1, test_Y.shape[0]))
    return train_X, train_Y, test_X, test_Y


"""
I still cannot fully follow the decision-boundary plotting code (see the short meshgrid/contourf sketch after this file).
"""


def plot_decision_boundary(model, X, y):
    # bounds of the grid, one unit beyond the data in every direction
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # dense grid of points covering the plane, with spacing h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # predict the class of every grid point, then colour the regions
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.
    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (m, K)
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions
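Since the note above says the decision-boundary plotting is still not fully clear, here is the idea in isolation (a standalone toy sketch, not the assignment's model or data): build a dense grid of points over the plane with np.meshgrid, ask the model for a prediction at every grid point, and let plt.contourf colour the plane by predicted class; the decision boundary is simply where the colour changes.

import numpy as np
import matplotlib.pyplot as plt

# Toy "model": label a point 1 if it lies inside the unit circle, else 0.
model = lambda points: (points[:, 0] ** 2 + points[:, 1] ** 2 < 1).astype(int)

xx, yy = np.meshgrid(np.arange(-2, 2, 0.01), np.arange(-2, 2, 0.01))
grid = np.c_[xx.ravel(), yy.ravel()]            # every grid point as one row (x1, x2)
Z = model(grid).reshape(xx.shape)               # one prediction per grid point

plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)   # filled regions = predicted classes
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()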
reg_utils.py
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio


def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def initialize_parameters(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                  W1 -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                  b1 -- bias vector of shape (layer_dims[l], 1)
                  Wl -- weight matrix of shape (layer_dims[l-1], layer_dims[l])
                  bl -- bias vector of shape (1, layer_dims[l])
    Tips:
    - For example: the layer_dims for the "Planar Data classification model" would have been [2,2,1].
      This means W1's shape was (2,2), b1 was (1,2), W2 was (2,1) and b2 was (1,1). Now you have to generalize it!
    - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

    return parameters


def forward_propagation(X, parameters):
    """
    Implements the forward propagation (and computes the loss) presented in Figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                  W1 -- weight matrix of shape ()
                  b1 -- bias vector of shape ()
                  W2 -- weight matrix of shape ()
                  b2 -- bias vector of shape ()
                  W3 -- weight matrix of shape ()
                  b3 -- bias vector of shape ()
    Returns:
    loss -- the loss function (vanilla logistic loss)
    """
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
    return a3, cache


def compute_cost(a3, Y):
    """
    Implement the cost function
    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3
    Returns:
    cost - value of the cost function
    """
    m = Y.shape[1]
    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    cost = 1. / m * np.nansum(logprobs)
    return cost


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation presented in figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    cache -- cache output from forward_propagation()
    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent
    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of n_model_backward
    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters['W' + str(i)] = ...
                  parameters['b' + str(i)] = ...
    """
    L = len(parameters) // 2

    for k in range(L):
        parameters["W" + str(k + 1)] = parameters["W" + str(k + 1)] - learning_rate * grads["dW" + str(k + 1)]
        parameters["b" + str(k + 1)] = parameters["b" + str(k + 1)] - learning_rate * grads["db" + str(k + 1)]

    return parameters


def load_2D_dataset(is_plot=True):
    data = sio.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T
    if is_plot:
        plt.scatter(train_X[0, :], train_X[1, :], c=train_Y, s=40, cmap=plt.cm.Spectral)
    return train_X, train_Y, test_X, test_Y


def predict(X, y, parameters):
    """
    This function is used to predict the results of a n-layer neural network.
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    Returns:
    p -- predictions for the given dataset X
    """
    m = X.shape[1]
    p = np.zeros((1, m), dtype=int)   # plain int instead of the deprecated np.int

    a3, caches = forward_propagation(X, parameters)

    for i in range(0, a3.shape[1]):
        if a3[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p


def plot_decision_boundary(model, X, y):
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.
    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (m, K)
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions
gc_utils.py
import numpy as np
import matplotlib.pyplot as plt


def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def dictionary_to_vector(parameters):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.
    """
    keys = []
    count = 0
    for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:
        new_vector = np.reshape(parameters[key], (-1, 1))
        keys = keys + [key] * new_vector.shape[0]

        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta, new_vector), axis=0)
        count = count + 1

    return theta, keys


def vector_to_dictionary(theta):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.
    """
    parameters = {}
    parameters["W1"] = theta[:20].reshape((5, 4))
    parameters["b1"] = theta[20:25].reshape((5, 1))
    parameters["W2"] = theta[25:40].reshape((3, 5))
    parameters["b2"] = theta[40:43].reshape((3, 1))
    parameters["W3"] = theta[43:46].reshape((1, 3))
    parameters["b3"] = theta[46:47].reshape((1, 1))
    return parameters


def gradients_to_vector(gradients):
    """
    Roll all our gradients dictionary into a single vector satisfying our specific required shape.
    """
    count = 0
    for key in ["dW1", "db1", "dW2", "db2", "dW3", "db3"]:
        new_vector = np.reshape(gradients[key], (-1, 1))

        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta, new_vector), axis=0)
        count = count + 1

    return theta
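To close the other gap mentioned at the top of these notes (the vector/dictionary conversion code), a quick round-trip check; it assumes the helpers defined in gc_utils.py above are available in the same session. dictionary_to_vector stacks W1, b1, ..., b3 into one (47, 1) column vector and records which key each entry came from, and vector_to_dictionary slices that vector back into the fixed shapes of the [4, 5, 3, 1] network used by gradient_check_n.

import numpy as np

np.random.seed(2)
parameters = {"W1": np.random.randn(5, 4), "b1": np.zeros((5, 1)),
              "W2": np.random.randn(3, 5), "b2": np.zeros((3, 1)),
              "W3": np.random.randn(1, 3), "b3": np.zeros((1, 1))}

theta, keys = dictionary_to_vector(parameters)
print(theta.shape)            # (47, 1) = 20 + 5 + 15 + 3 + 3 + 1
print(keys[:3], len(keys))    # ['W1', 'W1', 'W1'] 47

restored = vector_to_dictionary(theta)
print(all(np.allclose(parameters[k], restored[k]) for k in parameters))   # True: the round trip is lossless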