Reference article: Improving Deep Neural Networks - Initialization, Regularization and Gradient Checking
So far I am still not familiar with the dataset-loading and decision-boundary-plotting code, and I have not looked closely at the code that converts between vectors and dictionaries (there are short sketches on both after init_utils.py and gc_utils.py below).
The code implements the following:
Parameter initialization: 1.1 initialize the parameters with zeros; 1.2 initialize the parameters with random values; 1.3 initialize the parameters with He initialization, which suppresses gradient anomalies (see the videos on vanishing and exploding gradients). Regularization: 2.1 regularize a binary-classification model with the L2 norm, to try to avoid overfitting; 2.2 slim the model by randomly dropping units (dropout), also to try to avoid overfitting. Gradient checking: run gradient checking on the model to detect whether the gradients used during gradient descent deviate too much from numerical estimates.
1. Initialization
initialize.py
"""
初始化参数:
1:使用0来初始化参数
2:使用随机数来初始化参数
3:使用抑梯度异常初始化参数(参见视频中的梯度消失和梯度爆炸)
"""
import numpy
as np
import matplotlib
.pyplot
as plt
import Deep_Learning
.test2_1
.init_utils
plt
.rcParams
['figure.figsize'] = (7.0, 4.0)
plt
.rcParams
['image.interpolation'] = 'nearest'
plt
.rcParams
['image.cmap'] = 'gray'
train_X
, train_Y
, test_X
, test_Y
= Deep_Learning
.test2_1
.init_utils
.load_dataset
(is_plot
=True)
plt
.show
()
def initialize_parameters_zeros(layers_dims
):
"""
将模型的参数全部设置为0
:param layers_dims: -列表,模型的层数和对应每一层的节点的数量
:return: parameters -包含了所有W和b的字典(l属于[0,L])
Wl -权重矩阵,维度为(layers_dims[l], layers_dims[0])
bl -偏置向量,维度为(layers_dims[l], 1)
"""
parameters
= {}
L
= len(layers_dims
)
for l
in range(1, L
):
parameters
["W" + str(l
)] = np
.zeros
((layers_dims
[l
], layers_dims
[l
- 1]))
parameters
["b" + str(l
)] = np
.zeros
((layers_dims
[l
], 1))
assert (parameters
["W" + str(l
)].shape
) == ((layers_dims
[l
], layers_dims
[l
- 1]))
assert (parameters
["b" + str(l
)].shape
) == ((layers_dims
[l
], 1))
return parameters
print("====================测试initialize_parameters_zeros====================")
parameters
= initialize_parameters_zeros
([3, 2, 1])
print("W1 = " + str(parameters
["W1"]))
print("b1 = " + str(parameters
["b1"]))
print("W2 = " + str(parameters
["W2"]))
print("b2 = " + str(parameters
["b2"]))
def initialize_parameters_random(layers_dims
):
np
.random
.seed
(3)
parameters
= {}
L
= len(layers_dims
)
for l
in range(1, L
):
parameters
['W' + str(l
)] = np
.random
.rand
(layers_dims
[l
], layers_dims
[l
- 1]) * 10
parameters
['b' + str(l
)] = np
.zeros
((layers_dims
[l
], 1))
assert (parameters
["W" + str(l
)].shape
) == ((layers_dims
[l
],layers_dims
[l
- 1]))
assert (parameters
["b" + str(l
)].shape
) == ((layers_dims
[l
],1))
return parameters
print("====================测试initialize_parameters_random====================")
parameters
= initialize_parameters_random
([3, 2, 1])
print("W1 = " + str(parameters
["W1"]))
print("b1 = " + str(parameters
["b1"]))
print("W2 = " + str(parameters
["W2"]))
print("b2 = " + str(parameters
["b2"]))
def initialize_parameters_he(layers_dims
):
np
.random
.seed
(3)
parameters
= {}
L
= len(layers_dims
)
for l
in range(1, L
):
parameters
["W" + str(l
)] = np
.random
.randn
(layers_dims
[l
], layers_dims
[l
- 1]) * np
.sqrt
(2 / layers_dims
[l
- 1])
parameters
["b" + str(l
)] = np
.zeros
((layers_dims
[l
], 1))
assert (parameters
["W" + str(l
)].shape
) == ((layers_dims
[l
],layers_dims
[l
- 1]))
assert (parameters
["b" + str(l
)].shape
) == ((layers_dims
[l
],1))
return parameters
print("====================测试initialize_parameters_he====================")
parameters
= initialize_parameters_random
([2, 4, 1])
print("W1 = " + str(parameters
["W1"]))
print("b1 = " + str(parameters
["b1"]))
print("W2 = " + str(parameters
["W2"]))
print("b2 = " + str(parameters
["b2"]))
def model01(X
, Y
, learning_rate
=0.01, num_interations
=15000, print_cost
=True, initialization
="he", is_Plot
=True):
"""
实现一个三层的神经网络:LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
:param X: -输入数据,维度为(2,要训练/测试的数量)
:param Y: -标签,【0 | 1】,维度为(1,对应的是输入数据的标签)
:param learning_rate: -学习速率
:param num_interations: -迭代次数
:param print_cost: -是否打印成本值,每1000次打印一次
:param initialization: -字符串类型,初始化的类型【"zeros" | "random | "he"】
:param is_Plot: -是否绘制梯度下降的曲线图
:return: parameters -学习后的参数
"""
grads
= {}
costs
= []
m
= X
.shape
[1]
layers_dims
= [X
.shape
[0], 10, 5, 1]
if initialization
== "zeros":
parameters
= initialize_parameters_zeros
(layers_dims
)
elif initialization
== "random":
parameters
= initialize_parameters_random
(layers_dims
)
elif initialization
== "he":
parameters
= initialize_parameters_he
(layers_dims
)
else:
print("错误的初始化参数!程序退出")
exit
for i
in range(0, num_interations
):
a3
, cache
= Deep_Learning
.test2_1
.init_utils
.forward_propagation
(X
, parameters
)
cost
= Deep_Learning
.test2_1
.init_utils
.compute_loss
(a3
, Y
)
grads
= Deep_Learning
.test2_1
.init_utils
.backward_propagation
(X
, Y
, cache
)
parameters
= Deep_Learning
.test2_1
.init_utils
.update_parameters
(parameters
, grads
, learning_rate
)
if i
% 1000 == 0:
costs
.append
(cost
)
if print_cost
:
print("第" + str(i
) + "次迭代,成本值为:", np
.squeeze
(cost
))
if is_Plot
:
plt
.plot
(costs
)
plt
.ylabel
('cost')
plt
.xlabel
('iterations (per hundreds)')
plt
.title
("Learning rate = " + str(learning_rate
))
plt
.show
()
return parameters
parameters
= model01
(train_X
, train_Y
, initialization
="zeros")
print("训练集:")
predictions_train
= Deep_Learning
.test2_1
.init_utils
.predict
(train_X
, train_Y
, parameters
)
print("测试集:")
predictions_test
= Deep_Learning
.test2_1
.init_utils
.predict
(test_X
, test_Y
, parameters
)
print("predictions_train = " + str(predictions_train
))
print("predictions_test = " + str(predictions_test
))
plt
.title
("Model01 with Zeros initialization")
axes
= plt
.gca
()
axes
.set_xlim
([-1.5, 1.5])
axes
.set_ylim
([-1.5, 1.5])
Deep_Learning
.test2_1
.init_utils
.plot_decision_boundary
(lambda x
: Deep_Learning
.test2_1
.init_utils
.predict_dec
(parameters
, x
.T
), train_X
, train_Y
)
"""
1.零初始化结果显示为一条直线,学习率没有变化,模型没有学习,分类失败,零初始化导致神经网络无法打破对称性
2.随机初始化有助于打破对称,不同隐藏层单元可以学习到不同的参数,但是误差开始很高,这是由于具有较大的随机权重,若初始化参数没有很好的话会导致梯度消失、爆炸,也会减慢优化算法。
3.抑梯度异常初始化的结果很好
"""
Viewing the dataset (scatter plot of the training data)
1.1 Results with zero initialization
==================== testing initialize_parameters_zeros ====================
W1 = [[0. 0. 0.]
 [0. 0. 0.]]
b1 = [[0.]
 [0.]]
W2 = [[0. 0.]]
b2 = [[0.]]
Iteration 0, cost: 0.6931471805599453
Iteration 1000, cost: 0.6931471805599453
Iteration 2000, cost: 0.6931471805599453
Iteration 3000, cost: 0.6931471805599453
Iteration 4000, cost: 0.6931471805599453
Iteration 5000, cost: 0.6931471805599453
Iteration 6000, cost: 0.6931471805599453
Iteration 7000, cost: 0.6931471805599453
Iteration 8000, cost: 0.6931471805599453
Iteration 9000, cost: 0.6931471805599453
Iteration 10000, cost: 0.6931471805599455
Iteration 11000, cost: 0.6931471805599453
Iteration 12000, cost: 0.6931471805599453
Iteration 13000, cost: 0.6931471805599453
Iteration 14000, cost: 0.6931471805599453
Training set:
Accuracy: 0.5
Test set:
Accuracy: 0.5
predictions_train = [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0]]
predictions_test = [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
With zero initialization the decision boundary is a straight line: the cost never changes, the model learns nothing, and classification fails, because zero initialization keeps the network from breaking symmetry.
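A minimal standalone sketch (not part of the assignment code) of why zeros cannot break symmetry: every hidden unit in a layer sees the same inputs, the same (zero) weights and the same upstream signal, so the gradient rows it receives are identical and the units can never become different from one another.

import numpy as np

np.random.seed(0)
X = np.random.randn(3, 5)                        # 3 features, 5 examples
Y = (np.random.rand(1, 5) > 0.5).astype(float)

W1, b1 = np.zeros((2, 3)), np.zeros((2, 1))      # two hidden units with identical (zero) rows
W2, b2 = np.zeros((1, 2)), np.zeros((1, 1))

Z1 = np.dot(W1, X) + b1
A1 = np.maximum(0, Z1)                           # both rows of A1 are identical
A2 = 1 / (1 + np.exp(-(np.dot(W2, A1) + b2)))    # sigmoid output

dZ2 = A2 - Y
dA1 = np.dot(W2.T, dZ2)                          # both rows of dA1 are identical
dZ1 = dA1 * (Z1 > 0)
dW1 = np.dot(dZ1, X.T) / X.shape[1]

print(np.allclose(dW1[0], dW1[1]))               # True: after any update the two rows are still identical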
1.2 Results with random initialization
==================== testing initialize_parameters_random ====================
W1 = [[5.50797903 7.08147823 2.90904739]
 [5.10827605 8.92946954 8.96293089]]
b1 = [[0.]
 [0.]]
W2 = [[1.2558531 2.07242878]]
b2 = [[0.]]
Iteration 0, cost: inf
Iteration 1000, cost: 0.7621259370184228
Iteration 2000, cost: 0.6690029827916156
Iteration 3000, cost: 0.6513359019864347
Iteration 4000, cost: 0.6470045598995088
Iteration 5000, cost: 0.6447240738113086
Iteration 6000, cost: 0.6409840974187322
Iteration 7000, cost: 0.6278546078886772
Iteration 8000, cost: 0.5964022514198947
Iteration 9000, cost: 0.5493267350966603
Iteration 10000, cost: 0.5069866933465077
Iteration 11000, cost: 0.4899755730226258
Iteration 12000, cost: 0.47892254564188913
Iteration 13000, cost: 0.47441470732170654
Iteration 14000, cost: 0.47138671562861467
Training set:
Accuracy: 0.6833333333333333
Test set:
Accuracy: 0.67
predictions_train = [[1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1
1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0
1 1 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1
1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1
1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1
1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1
1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1
1 1 1 1 1 0 1 1 1 1 1 0]]
predictions_test = [[1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1
0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1]]
Random initialization helps break symmetry, so different hidden units can learn different parameters, but the cost starts very high because the random weights are large; badly chosen initial parameters can lead to vanishing or exploding gradients and also slow down optimization.
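Since initialize_parameters_random above uses np.random.rand, all of its weights are positive: rand samples uniformly from [0, 1), whereas np.random.randn (used by the He initializer) samples from a standard normal distribution and gives both signs. A quick standalone check:

import numpy as np

np.random.seed(3)
print(np.random.rand(2, 3))     # uniform on [0, 1); times 10 these are exactly the W1 values printed above
np.random.seed(3)
print(np.random.randn(2, 3))    # standard normal: mixed signs, not bounded by 1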
1.3 Results with He initialization
==================== testing initialize_parameters_he ====================
W1 = [[5.50797903 7.08147823]
 [2.90904739 5.10827605]
 [8.92946954 8.96293089]
 [1.2558531 2.07242878]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[0.51467203 4.40809844 0.29876211 4.56833224]]
b2 = [[0.]]
(These W values come from initialize_parameters_random, because the test above calls that function by mistake; the training run below does use He initialization.)
Iteration 0, cost: 0.8830537463419761
Iteration 1000, cost: 0.6879825919728063
Iteration 2000, cost: 0.6751286264523371
Iteration 3000, cost: 0.6526117768893807
Iteration 4000, cost: 0.6082958970572938
Iteration 5000, cost: 0.5304944491717495
Iteration 6000, cost: 0.4138645817071794
Iteration 7000, cost: 0.3117803464844441
Iteration 8000, cost: 0.23696215330322562
Iteration 9000, cost: 0.18597287209206836
Iteration 10000, cost: 0.15015556280371817
Iteration 11000, cost: 0.12325079292273552
Iteration 12000, cost: 0.09917746546525932
Iteration 13000, cost: 0.08457055954024274
Iteration 14000, cost: 0.07357895962677362
Training set:
Accuracy: 0.9933333333333333
Test set:
Accuracy: 0.96
predictions_train = [[1 0 1 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 1 0 1 1 0 1 1 0 0 0
0 1 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 0
0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 1 0
0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0
0 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 1 0
1 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1
0 1 0 0 0 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 1 0
1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 1
1 1 1 0 0 0 0 1 1 0 1 0]]
predictions_test = [[1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1
0 1 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0
1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0]]
He initialization (the "gradient-taming" initialization) gives very good results.
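For reference, initialize_parameters_he draws each weight from a zero-mean normal distribution scaled by the fan-in of its layer and leaves the biases at zero (here n^{[l-1]} is the number of units in layer l-1):

W^{[l]} \sim \sqrt{\tfrac{2}{n^{[l-1]}}} \,\mathcal{N}(0, 1), \qquad b^{[l]} = 0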
2. Regularization
regularization.py
"""
正则化模型:
1:使用二范数对二分类模型正则化,尝试避免过拟合。
2:使用随机删除节点的方法精简模型,同样是为了尝试避免过拟合。
我们要做的就是使用模型画一条线将红色和蓝色分界,分别使用以下三种
1.不使用正则化
2.使用正则化
2.1使用L2正则化:将lambd输入设置为非零值,不使用lambda是因为其是python的关键保留字
2.2使用随机节点删除:将keep_prob设置为小于1的值
"""
import numpy
as np
import matplotlib
.pyplot
as plt
import Deep_Learning
.test2_1
.reg_utils
plt
.rcParams
['figure.figsize'] = (7.0, 4.0)
plt
.rcParams
['image.interpolation'] = 'nearest'
plt
.rcParams
['image.cmap'] = 'gray'
train_X
, train_Y
, test_X
, test_Y
= Deep_Learning
.test2_1
.reg_utils
.load_2D_dataset
(is_plot
=True)
plt
.show
()
"""
我们要做的就是使用模型画一条线将红色和蓝色分界,分别使用以下三种
1.不使用正则化
2.使用正则化
2.1使用L2正则化:将lambd输入设置为非零值,不使用lambda是因为其是python的关键保留字
2.2使用随机节点删除:将keep_prob设置为小于1的值
"""
def model02(X
,Y
,learning_rate
=0.3,num_iterations
=30000,print_cost
=True,is_Plot
=True,lambd
=0,keep_prob
=1):
"""
实现一个三层的神经网络:LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
:param X: -输入数据,维度为(2,要训练/测试的数量)
:param Y: -标签,【0(蓝色) | 1(红色)】,维度为(1,对应的是输入数据的标签)
:param learning_rate: -学习速率
:param num_iterations: -迭代次数
:param print_cost: -是否打印成本值,每迭代10000次打印一次,但是每1000次记录一个成本值
:param is_Plot: -是否绘制梯度下降的曲线图
:param lambd: -正则化的超参数、实数
:param keep_prob: -随机删除节点的概率
:return: parameters -学习后的参数
"""
grads
= {}
costs
= []
m
= X
.shape
[1]
layers_dims
= [X
.shape
[0],20,3,1]
parameters
= Deep_Learning
.test2_1
.reg_utils
.initialize_parameters
(layers_dims
)
for i
in range(0,num_iterations
):
if keep_prob
== 1:
a3
,cache
= Deep_Learning
.test2_1
.reg_utils
.forward_propagation
(X
,parameters
)
elif keep_prob
< 1:
a3
,cache
= forward_propagation_with_dropout
(X
,parameters
,keep_prob
)
else:
print("keep_prob参数错误,程序退出")
if lambd
== 0:
cost
= Deep_Learning
.test2_1
.reg_utils
.compute_cost
(a3
,Y
)
else:
cost
= compute_cost_with_regularization
(a3
,Y
,parameters
,lambd
)
assert (lambd
== 0 or keep_prob
== 1)
if (lambd
== 0 and keep_prob
== 1):
grads
= Deep_Learning
.test2_1
.reg_utils
.backward_propagation
(X
,Y
,cache
)
elif lambd
!= 0:
grads
= backward_propagation_with_regularization
(X
,Y
,cache
,lambd
)
elif keep_prob
< 1:
grads
= backward_propagation_with_dropout
(X
,Y
,cache
,keep_prob
)
parameters
= Deep_Learning
.test2_1
.reg_utils
.update_parameters
(parameters
,grads
,learning_rate
)
if i
% 1000 == 0:
costs
.append
(cost
)
if (print_cost
and i
% 10000 == 0):
print("第" + str(i
) + "次迭代,成本值为:" + str(cost
))
if is_Plot
:
plt
.plot
(costs
)
plt
.ylabel
('cost')
plt
.xlabel
('iterations(x1, 000)')
plt
.title
("Learning rate = " + str(learning_rate
))
plt
.show
()
return parameters
parameters
= model02
(train_X
,train_Y
)
print("训练集:")
predictions_train
= Deep_Learning
.test2_1
.reg_utils
.predict
(train_X
,train_Y
,parameters
)
print("测试集:")
predictions_test
= Deep_Learning
.test2_1
.reg_utils
.predict
(test_X
,test_Y
,parameters
)
plt
.title
("Model without regularization")
axes
= plt
.gca
()
axes
.set_xlim
([-0.75,0.40])
axes
.set_ylim
([-0.75,0.65])
Deep_Learning
.test2_1
.reg_utils
.plot_decision_boundary
(lambda x
: Deep_Learning
.test2_1
.reg_utils
.predict_dec
(parameters
,x
.T
),train_X
,train_Y
)
def compute_cost_with_regularization(A3
,Y
,parameters
,lambd
):
"""
实现L2正则化计算成本
:param A3: -正向传播的输出结果,维度为(输出节点数量,训练/测试的数量)
:param Y: -标签向量,与数据一一对应,维度为(输出节点数量,训练/测试的数量)
:param parameters: -包含模型学习后的参数的字典
:param lambd: -正则化的超参数、实数
:return: cost -正则化损失的值
"""
m
= Y
.shape
[1]
W1
= parameters
["W1"]
W2
= parameters
["W2"]
W3
= parameters
["W3"]
cross_entropy_cost
= Deep_Learning
.test2_1
.reg_utils
.compute_cost
(A3
,Y
)
L2_regularization_cost
= lambd
* (np
.sum(np
.square
(W1
)) + np
.sum(np
.square
(W2
)) + np
.sum(np
.square
(W3
))) / (2 * m
)
cost
= cross_entropy_cost
+ L2_regularization_cost
return cost
def backward_propagation_with_regularization(X
,Y
,cache
,lambd
):
"""
实现添加了L2正则化的模型的后向传播
:param X: -输入数据集,维度为(输入节点数量,数据集里面的数量)
:param Y: -标签,维度为(输出节点数量,数据集里面的数量)
:param cache: -来自forward_propagation()的cache输出
:param lambd: -regularization超参数,实数
:return: gradients -包含每个参数、激活值和预激活值变量的梯度的字典
"""
m
= X
.shape
[1]
(Z1
,A1
,W1
,b1
,Z2
,A2
,W2
,b2
,Z3
,A3
,W3
,b3
) = cache
dZ3
= A3
- Y
dW3
= (1 / m
) * np
.dot
(dZ3
,A2
.T
) + ((lambd
* W3
) / m
)
db3
= (1 / m
) * np
.sum(dZ3
,axis
=1,keepdims
=True)
dA2
= np
.dot
(W3
.T
,dZ3
)
dZ2
= np
.multiply
(dA2
,np
.int64
(A2
> 0))
dW2
= (1 / m
) * np
.dot
(dZ2
,A1
.T
) + ((lambd
* W2
) / m
)
db2
= (1 / m
) * np
.sum(dZ2
,axis
=1,keepdims
=True)
dA1
= np
.dot
(W2
.T
,dZ2
)
dZ1
= np
.multiply
(dA1
,np
.int64
(A1
> 0))
dW1
= (1 / m
) * np
.dot
(dZ1
,X
.T
) + ((lambd
* W1
) / m
)
db1
= (1 / m
) * np
.sum(dZ1
,axis
=1,keepdims
=True)
gradients
= {"dZ3": dZ3
,"dW3": dW3
,"db3": db3
,"dA2": dA2
,
"dZ2": dZ2
,"dW2": dW2
,"db2": db2
,"dA1": dA1
,
"dZ1": dZ1
,"dW1": dW1
,"db1": db1
}
return gradients
parameters
= model02
(train_X
,train_Y
,lambd
=0.7)
print("使用正则化,训练集:")
predictions_train
= Deep_Learning
.test2_1
.reg_utils
.predict
(train_X
,train_Y
,parameters
)
print("使用正则化,测试集:")
predictions_test
= Deep_Learning
.test2_1
.reg_utils
.predict
(test_X
,test_Y
,parameters
)
plt
.title
("Model with L2-regularization")
axes
= plt
.gca
()
axes
.set_xlim
([-0.75,0.40])
axes
.set_ylim
([-0.75,0.65])
Deep_Learning
.test2_1
.reg_utils
.plot_decision_boundary
(
lambda x
: Deep_Learning
.test2_1
.reg_utils
.predict_dec
(parameters
,x
.T
),train_X
,train_Y
)
"""
lambd的值是可以使用开发集调整时的超参数,L2正则化会使决策边界更加平滑,若lambd太大,会平滑过渡模型产生高偏差
L2正则化依赖于较小权重的模型比具有较大权重的模型更简单这样的假设,因此通过削弱成本函数中权重的平方值,可以将所有权重值逐渐变小
权重高的话会有更平滑的模型,其中输入变化时输出变化更慢,L2正则化对以下内容有影响
1.成本计算:正则化的计算需要添加到成本函数中
2.反向传播:在权重矩阵中,梯度计算时也要依据正则化作出相应的计算
3.重量变小(重量衰减):权重被逐渐改变到较小的值
"""
def forward_propagation_with_dropout(X
,parameters
,keep_prob
=0.5):
"""
实现具有随机舍弃节点的前向传播
LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID
:param X: -输入数据集,维度为(2,示例数)
:param parameters: -包含参数"W1","b1","W2","b2","W3","b3"的字典
:param keep_prob: -随机删除的概率,实数
:return: A3 -最后的激活值,维度为(1,1),正向传播的输出
cache -存储一些用于计算反向传播的数值的元组
"""
np
.random
.seed
(1)
W1
= parameters
["W1"]
b1
= parameters
["b1"]
W2
= parameters
["W2"]
b2
= parameters
["b2"]
W3
= parameters
["W3"]
b3
= parameters
["b3"]
Z1
= np
.dot
(W1
,X
) + b1
A1
= Deep_Learning
.test2_1
.reg_utils
.relu
(Z1
)
D1
= np
.random
.rand
(A1
.shape
[0],A1
.shape
[1])
D1
= D1
< keep_prob
A1
= A1
* D1
A1
= A1
/ keep_prob
"""
示例代码帮助理解:
import numpy as np
np.random.seed(1)
# ??????????rand和randn区别
A1 = np.random.randn(1, 3)
print("A1:", A1) # A1: [[ 1.62434536 -0.61175641 -0.52817175]]
D1 = np.random.rand(A1.shape[0], A1.shape[1])
print("D1:", D1) # D1: [[0.39676747 0.53881673 0.41919451]]
keep_prob = 0.5
D1 = D1 < keep_prob
print(D1) # [[ True False True]]
A1 = 0.01
A1 = A1 * D1
A1 = A1 / keep_prob
print(A1) # [[0.02 0. 0.02]]
"""
Z2
= np
.dot
(W2
,A1
) + b2
A2
= Deep_Learning
.test2_1
.reg_utils
.relu
(Z2
)
D2
= np
.random
.rand
(A2
.shape
[0],A2
.shape
[1])
D2
= D2
< keep_prob
A2
= A2
* D2
A2
= A2
/ keep_prob
Z3
= np
.dot
(W3
,A2
) + b3
A3
= Deep_Learning
.test2_1
.reg_utils
.sigmoid
(Z3
)
cache
= (Z1
,D1
,A1
,W1
,b1
,Z2
,D2
,A2
,W2
,b2
,Z3
,A3
,W3
,b3
)
return A3
,cache
def backward_propagation_with_dropout(X
,Y
,cache
,keep_prob
):
"""
:param X: -输入数据集,维度为(2,示例数)
:param Y: -标签,维度为(输出节点数量,示例数量)
:param cache: -来自forward_propagation_with_dropout()的cache输出
:param keep_prob: -随机删除的概率,实数
:return:gradients -一个关于每个参数,激活值和预激活变量的梯度值的字典
"""
m
= X
.shape
[1]
(Z1
,D1
,A1
,W1
,b1
,Z2
,D2
,A2
,W2
,b2
,Z3
,A3
,W3
,b3
) = cache
dZ3
= A3
- Y
dW3
= (1 / m
) * np
.dot
(dZ3
,A2
.T
)
db3
= (1. / m
) * np
.sum(dZ3
,axis
=1,keepdims
=True)
dA2
= np
.dot
(W3
.T
,dZ3
)
dA2
= dA2
* D2
dA2
= dA2
/ keep_prob
dZ2
= np
.multiply
(dA2
,np
.int64
(A2
> 0))
dW2
= 1. / m
* np
.dot
(dZ2
,A1
.T
)
db2
= 1. / m
* np
.sum(dZ2
,axis
=1,keepdims
=True)
dA1
= np
.dot
(W2
.T
,dZ2
)
dA1
= dA1
* D1
dA1
= dA1
/ keep_prob
dZ1
= np
.multiply
(dA1
,np
.int64
(A1
> 0))
dW1
= 1. / m
* np
.dot
(dZ1
,X
.T
)
db1
= 1. / m
* np
.sum(dZ1
,axis
=1,keepdims
=True)
gradients
= {"dZ3": dZ3
,"dW3": dW3
,"db3": db3
,"dA2": dA2
,
"dZ2": dZ2
,"dW2": dW2
,"db2": db2
,"dA1": dA1
,
"dZ1": dZ1
,"dW1": dW1
,"db1": db1
}
return gradients
parameters
= model02
(train_X
,train_Y
,keep_prob
=0.86,learning_rate
=0.3)
print("使用随机删除节点,训练集:")
predictions_train
= Deep_Learning
.test2_1
.reg_utils
.predict
(train_X
,train_Y
,parameters
)
print("使用随机删除节点,测试集:")
predictions_test
= Deep_Learning
.test2_1
.reg_utils
.predict
(test_X
,test_Y
,parameters
)
plt
.title
("Model with dropout")
axes
= plt
.gca
()
axes
.set_xlim
([-0.75,0.40])
axes
.set_ylim
([-0.75,0.65])
Deep_Learning
.test2_1
.reg_utils
.plot_decision_boundary
(
lambda x
: Deep_Learning
.test2_1
.reg_utils
.predict_dec
(parameters
,x
.T
),train_X
,train_Y
)
"""
正则化会把训练集的准确度降低,但是测试集的准确度提高了
"""
Viewing the dataset (scatter plot of the training data)
2.1 Results without regularization
Iteration 0, cost: 0.6557412523481002
Iteration 10000, cost: 0.16329987525724216
Iteration 20000, cost: 0.1385164242327309
Training set:
Accuracy: 0.9478672985781991
Test set:
Accuracy: 0.915
Without regularization, the decision boundary clearly overfits the training data.
2.2 Results with L2 regularization
Iteration 0, cost: 0.6974484493131264
Iteration 10000, cost: 0.2684918873282239
Iteration 20000, cost: 0.2680916337127301
With L2 regularization, training set:
Accuracy: 0.9383886255924171
With L2 regularization, test set:
Accuracy: 0.93
lambd is a hyperparameter that can be tuned on a dev set. L2 regularization makes the decision boundary smoother, but if lambd is too large it over-smooths and the model ends up with high bias. L2 regularization relies on the assumption that a model with small weights is simpler than one with large weights, so by penalizing the squared weights in the cost function all weights are gradually pushed to smaller values; large weights become too costly, which yields a smoother model whose output changes more slowly as the input changes. L2 regularization affects three things:
the cost computation (the regularization term is added to the cost function); backpropagation (the gradients of the weight matrices gain an extra term from the regularization); and the weights themselves, which shrink ("weight decay") towards smaller values.
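In formula form, what compute_cost_with_regularization computes for this three-layer model is

J_{regularized} = -\frac{1}{m}\sum_{i=1}^{m}\left( y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\!\left(1-a^{[3](i)}\right) \right) + \frac{\lambda}{2m}\sum_{l=1}^{3}\left\lVert W^{[l]} \right\rVert_F^2

and correspondingly each dW^{[l]} in backward_propagation_with_regularization gains the extra term \frac{\lambda}{m} W^{[l]}.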
2.3 Results with dropout
Iteration 0, cost: 0.6543912405149825
Iteration 10000, cost: 0.0610169865749056
Iteration 20000, cost: 0.060582435798513114
With dropout, training set:
Accuracy: 0.9289099526066351
With dropout, test set:
Accuracy: 0.95
Regularization lowers the training-set accuracy a little, but the test-set accuracy improves.
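Why forward_propagation_with_dropout divides the kept activations by keep_prob (inverted dropout): the division keeps the expected value of every activation unchanged, so later layers see the same scale during training as at test time, when no units are dropped. A standalone numerical check:

import numpy as np

np.random.seed(1)
keep_prob = 0.86
A = np.random.rand(20, 1000) + 0.5                        # some positive activations
D = np.random.rand(A.shape[0], A.shape[1]) < keep_prob    # keep mask, roughly 86% True
A_drop = (A * D) / keep_prob                              # inverted dropout scaling
print(A.mean(), A_drop.mean())                            # the two means come out very close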
3. Gradient checking
gradient_check.py
"""
梯度校验:对模型使用梯度校验,检测它是否在梯度下降的过程中出现误差过大的情况。
"""
import numpy
as np
import Deep_Learning
.test2_1
.gc_utils
def forward_propagation(x
, theta
):
"""
实现线性前向传播(计算J)(J(theta) = theta * x)
:param x: -一个实值输入
:param theta: -参数,也是一个实数
:return: J -函数J的值
"""
J
= np
.dot
(theta
, x
)
return J
def backward_propagation(x
, theta
):
"""
计算J相对于theta的导数
:param x: -一个实值输入
:param theta: -参数,也是一个实数
:return: dtheta -相对于theta的成本梯度
"""
dtheta
= x
return dtheta
def gradient_check(x
, theta
, epsilon
=1e-7):
"""
:param x: -一个实值输入
:param theta: -参数,也是一个实数
:param epsilon: -计算输入的微笑偏移以计算近似梯度
:return: 近似梯度和后向传播梯度之间的差异
"""
thetaplus
= theta
+ epsilon
thetaminus
= theta
- epsilon
J_plus
= forward_propagation
(x
, thetaplus
)
J_minus
= forward_propagation
(x
, thetaminus
)
gradapprox
= (J_plus
- J_minus
) / (2 * epsilon
)
grad
= backward_propagation
(x
, theta
)
numerator
= np
.linalg
.norm
(grad
- gradapprox
)
denominator
= np
.linalg
.norm
(grad
) + np
.linalg
.norm
(gradapprox
)
difference
= numerator
/ denominator
if difference
< 1e-7:
print("一维线性梯度检查:梯度正常")
else:
print("一维线性梯度检查:梯度超出阈值")
return difference
print("=================测试gradient_check====================")
x
, theta
= 2, 4
difference
= gradient_check
(x
, theta
)
print("difference = " + str(difference
))
"""
运行结果:
一维线性梯度检查:梯度正常
difference = 2.919335883291695e-10
"""
def forward_propagation_n(X, Y, parameters):
    """
    :param X: training set of m examples
    :param Y: labels for the m examples
    :param parameters: dictionary containing "W1", "b1", "W2", "b2", "W3", "b3" (layers_dims = [4, 5, 3, 1])
    :return: cost - the cost function
    """
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    Z1 = np.dot(W1, X) + b1
    A1 = Deep_Learning.test2_1.gc_utils.relu(Z1)

    Z2 = np.dot(W2, A1) + b2
    A2 = Deep_Learning.test2_1.gc_utils.relu(Z2)

    Z3 = np.dot(W3, A2) + b3
    # Note: the output layer uses relu here rather than sigmoid, so A3 is not confined to (0, 1);
    # this can make np.log(A3) or np.log(1 - A3) blow up (see the remark after the run below).
    A3 = Deep_Learning.test2_1.gc_utils.relu(Z3)

    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = (1 / m) * np.sum(logprobs)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    return cost, cache


def backward_propagation_n(X, Y, cache):
    """
    :param X: input datapoint, of shape (input size, 1)
    :param Y: label
    :param cache: cache output from forward_propagation_n()
    :return: gradients - dictionary with the gradients of the cost with respect to each parameter,
             activation and pre-activation variable
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = (1. / m) * np.dot(dZ3, A2.T)
    db3 = (1. / m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}
    return gradients


def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Check whether backward_propagation_n correctly computes the gradient of the cost produced by forward_propagation_n.
    :param parameters: dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"
    :param gradients: output of backward_propagation_n, the gradients of the cost with respect to the parameters
    :param X: input datapoint, of shape (input size, 1)
    :param Y: label
    :param epsilon: tiny shift of the input, used to compute the approximate gradient
    :return: difference - the difference between the approximate gradient and the backward-propagation gradient
    """
    # flatten the parameter dictionary and the gradient dictionary into column vectors
    parameters_values, keys = Deep_Learning.test2_1.gc_utils.dictionary_to_vector(parameters)
    grad = Deep_Learning.test2_1.gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # perturb one parameter at a time and compute the centered-difference approximation
    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], cache = forward_propagation_n(X, Y, Deep_Learning.test2_1.gc_utils.vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], cache = forward_propagation_n(X, Y, Deep_Learning.test2_1.gc_utils.vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference < 1e-7:
        print("N-dimensional gradient check: the gradient is correct")
    else:
        print("N-dimensional gradient check: the gradient exceeds the threshold")

    return difference


print("======================= testing gradient_check_n ====================")
np.random.seed(2)
X = np.random.randn(4, 3)
Y = np.array([1, 1, 0])
W1 = np.random.randn(5, 4)
b1 = np.zeros((5, 1))
W2 = np.random.randn(3, 5)
b2 = np.zeros((3, 1))
W3 = np.random.randn(1, 3)
b3 = np.zeros((1, 1))
parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3}

cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)
print("difference = " + str(difference))
"""
Output (no concrete numeric result here, since the check reports "exceeds the threshold";
the outcome also depends on the random seed used to initialize X):
N-dimensional gradient check: the gradient exceeds the threshold
difference = nan
"""
Toolkit code
init_utils.py
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets


def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def compute_loss(a3, Y):
    """
    Implement the loss function
    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3
    Returns:
    loss - value of the loss function
    """
    m = Y.shape[1]
    np.seterr(divide='ignore', invalid='ignore')
    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    loss = 1. / m * np.nansum(logprobs)
    return loss


def forward_propagation(X, parameters):
    """
    Implements the forward propagation (and computes the loss) presented in Figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                  W1 -- weight matrix of shape ()
                  b1 -- bias vector of shape ()
                  W2 -- weight matrix of shape ()
                  b2 -- bias vector of shape ()
                  W3 -- weight matrix of shape ()
                  b3 -- bias vector of shape ()
    Returns:
    loss -- the loss function (vanilla logistic loss)
    """
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
    return a3, cache


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation presented in figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    cache -- cache output from forward_propagation()
    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent
    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of n_model_backward
    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters['W' + str(i)] = ...
                  parameters['b' + str(i)] = ...
    """
    L = len(parameters) // 2  # number of layers in the network

    for k in range(L):
        parameters["W" + str(k + 1)] = parameters["W" + str(k + 1)] - learning_rate * grads["dW" + str(k + 1)]
        parameters["b" + str(k + 1)] = parameters["b" + str(k + 1)] - learning_rate * grads["db" + str(k + 1)]

    return parameters


def predict(X, y, parameters):
    """
    This function is used to predict the results of a n-layer neural network.
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    Returns:
    p -- predictions for the given dataset X
    """
    m = X.shape[1]
    p = np.zeros((1, m), dtype=int)   # plain int instead of the deprecated np.int

    # forward propagation, then threshold the output probabilities at 0.5
    a3, caches = forward_propagation(X, parameters)

    for i in range(0, a3.shape[1]):
        if a3[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p


def load_dataset(is_plot=True):
    np.random.seed(1)
    train_X, train_Y = sklearn.datasets.make_circles(n_samples=300, noise=.05)
    np.random.seed(2)
    test_X, test_Y = sklearn.datasets.make_circles(n_samples=100, noise=.05)
    if is_plot:
        plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1, test_Y.shape[0]))
    return train_X, train_Y, test_X, test_Y


"""
I still cannot fully follow the decision-boundary plotting code (see the short meshgrid/contourf sketch after this file).
"""


def plot_decision_boundary(model, X, y):
    # bounds of the grid, one unit beyond the data in every direction
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # dense grid of points covering the plane, with spacing h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # predict the class of every grid point, then colour the regions
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.
    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (m, K)
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions
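Since the note above says the decision-boundary plotting is still not fully clear, here is the idea in isolation (a standalone toy sketch, not the assignment's model or data): build a dense grid of points over the plane with np.meshgrid, ask the model for a prediction at every grid point, and let plt.contourf colour the plane by predicted class; the decision boundary is simply where the colour changes.

import numpy as np
import matplotlib.pyplot as plt

# Toy "model": label a point 1 if it lies inside the unit circle, else 0.
model = lambda points: (points[:, 0] ** 2 + points[:, 1] ** 2 < 1).astype(int)

xx, yy = np.meshgrid(np.arange(-2, 2, 0.01), np.arange(-2, 2, 0.01))
grid = np.c_[xx.ravel(), yy.ravel()]            # every grid point as one row (x1, x2)
Z = model(grid).reshape(xx.shape)               # one prediction per grid point

plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)   # filled regions = predicted classes
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()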
reg_utils.py
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio


def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def initialize_parameters(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                  W1 -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                  b1 -- bias vector of shape (layer_dims[l], 1)
                  Wl -- weight matrix of shape (layer_dims[l-1], layer_dims[l])
                  bl -- bias vector of shape (1, layer_dims[l])
    Tips:
    - For example: the layer_dims for the "Planar Data classification model" would have been [2,2,1].
      This means W1's shape was (2,2), b1 was (1,2), W2 was (2,1) and b2 was (1,1). Now you have to generalize it!
    - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

    return parameters


def forward_propagation(X, parameters):
    """
    Implements the forward propagation (and computes the loss) presented in Figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                  W1 -- weight matrix of shape ()
                  b1 -- bias vector of shape ()
                  W2 -- weight matrix of shape ()
                  b2 -- bias vector of shape ()
                  W3 -- weight matrix of shape ()
                  b3 -- bias vector of shape ()
    Returns:
    loss -- the loss function (vanilla logistic loss)
    """
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
    return a3, cache


def compute_cost(a3, Y):
    """
    Implement the cost function
    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3
    Returns:
    cost - value of the cost function
    """
    m = Y.shape[1]
    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    cost = 1. / m * np.nansum(logprobs)
    return cost


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation presented in figure 2.
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    cache -- cache output from forward_propagation()
    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent
    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of n_model_backward
    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters['W' + str(i)] = ...
                  parameters['b' + str(i)] = ...
    """
    L = len(parameters) // 2

    for k in range(L):
        parameters["W" + str(k + 1)] = parameters["W" + str(k + 1)] - learning_rate * grads["dW" + str(k + 1)]
        parameters["b" + str(k + 1)] = parameters["b" + str(k + 1)] - learning_rate * grads["db" + str(k + 1)]

    return parameters


def load_2D_dataset(is_plot=True):
    data = sio.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T
    if is_plot:
        plt.scatter(train_X[0, :], train_X[1, :], c=train_Y, s=40, cmap=plt.cm.Spectral)
    return train_X, train_Y, test_X, test_Y


def predict(X, y, parameters):
    """
    This function is used to predict the results of a n-layer neural network.
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    Returns:
    p -- predictions for the given dataset X
    """
    m = X.shape[1]
    p = np.zeros((1, m), dtype=int)   # plain int instead of the deprecated np.int

    a3, caches = forward_propagation(X, parameters)

    for i in range(0, a3.shape[1]):
        if a3[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p


def plot_decision_boundary(model, X, y):
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.
    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (m, K)
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions
gc_utils.py
import numpy as np
import matplotlib.pyplot as plt


def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def dictionary_to_vector(parameters):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.
    """
    keys = []
    count = 0
    for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:
        new_vector = np.reshape(parameters[key], (-1, 1))
        keys = keys + [key] * new_vector.shape[0]

        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta, new_vector), axis=0)
        count = count + 1

    return theta, keys


def vector_to_dictionary(theta):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.
    """
    parameters = {}
    parameters["W1"] = theta[:20].reshape((5, 4))
    parameters["b1"] = theta[20:25].reshape((5, 1))
    parameters["W2"] = theta[25:40].reshape((3, 5))
    parameters["b2"] = theta[40:43].reshape((3, 1))
    parameters["W3"] = theta[43:46].reshape((1, 3))
    parameters["b3"] = theta[46:47].reshape((1, 1))
    return parameters


def gradients_to_vector(gradients):
    """
    Roll all our gradients dictionary into a single vector satisfying our specific required shape.
    """
    count = 0
    for key in ["dW1", "db1", "dW2", "db2", "dW3", "db3"]:
        new_vector = np.reshape(gradients[key], (-1, 1))

        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta, new_vector), axis=0)
        count = count + 1

    return theta
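To close the other gap mentioned at the top of these notes (the vector/dictionary conversion code), a quick round-trip check; it assumes the helpers defined in gc_utils.py above are available in the same session. dictionary_to_vector stacks W1, b1, ..., b3 into one (47, 1) column vector and records which key each entry came from, and vector_to_dictionary slices that vector back into the fixed shapes of the [4, 5, 3, 1] network used by gradient_check_n.

import numpy as np

np.random.seed(2)
parameters = {"W1": np.random.randn(5, 4), "b1": np.zeros((5, 1)),
              "W2": np.random.randn(3, 5), "b2": np.zeros((3, 1)),
              "W3": np.random.randn(1, 3), "b3": np.zeros((1, 1))}

theta, keys = dictionary_to_vector(parameters)
print(theta.shape)            # (47, 1) = 20 + 5 + 15 + 3 + 3 + 1
print(keys[:3], len(keys))    # ['W1', 'W1', 'W1'] 47

restored = vector_to_dictionary(theta)
print(all(np.allclose(parameters[k], restored[k]) for k in parameters))   # True: the round trip is lossless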