Andrew Ng Machine-Learning, Week 5: Neural Network Back Propagation

This section trains the neural network with back propagation.

Both the cost function and the gradient function are regularized.
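For reference, the regularized cost function minimized this week is the standard one from the course exercise (m training examples, K = 10 output classes, L layers with s_l units in layer l, and regularization strength λ):

$$J(\Theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[-y_k^{(i)}\log\big(h_\Theta(x^{(i)})\big)_k - \big(1-y_k^{(i)}\big)\log\Big(1-\big(h_\Theta(x^{(i)})\big)_k\Big)\Big] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}\Big(\Theta_{j,i}^{(l)}\Big)^2$$

The bias weights (the first column of each Θ matrix) are excluded from the regularization sum, which is why the code below zeroes out or slices off that column.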


After quite a bit of fiddling, I finally finished this week's lectures and assignment. The part I implemented myself in particular hit a lot of pitfalls that you never run into when just following the reference code.

The hidden-layer visualization produced by the code is shown below.

Source code, with comments and my own notes:

Accuracy is generally between 90% and 95%.

# -*- coding:utf-8 -*-
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np  # the code below uses both the `numpy` and `np` names
import scipy.io as sio
import scipy.optimize as opt
from sklearn.metrics import classification_report
def load_data(path, transpose=True):
    """
    读取数据
    :param path:
    :param transpose:
    :return:
    """
    data = sio.loadmat(path)
    y = data.get('y')
    # 将y由列向量变成行向量
    y = y.reshape(y.shape[0])
    X = data.get('X')  # 5000*400
    if transpose:
        # 转置当前数据中,可以理解为将图片旋转,将数据分为5000个20*20的矩阵
        X = numpy.array([im.reshape((20, 20)).T for im in X])
        # 将旋转后的图片展开成一行 5000*400
        X = numpy.array([im.reshape(400) for im in X])
    return X, y
X, _ = load_data('./data/ex4data1.mat', transpose=False)
def plot_100_image(X):
    """
    #绘图函数,画100张图片
    X :
    """
    # 获得图片大小(根号下400)
    size = int(numpy.sqrt(X.shape[1]))
    # 随机从X中选择100组数据
    sample_idx = numpy.random.choice(numpy.arange(X.shape[0]), 100)
    # 取数随机数据
    sample_images = X[sample_idx, :]
    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
    for r in range(10):
        for c in range(10):
            ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(numpy.array([]))
            plt.yticks(numpy.array([]))
# draw 100 images at random
# plot_100_image(X)
# plt.show()
"""
Feed-forward neural network
"""
X_raw, y_raw = load_data('./data/ex4data1.mat', transpose=False)
# add a bias column of ones -> (5000, 401)
X = numpy.insert(X_raw, 0, numpy.ones(X_raw.shape[0]), axis=1)
def expand_y(y):
    """
    Expand the label vector:
    turn a label like 1 or 2 into a one-hot vector such as 0 1 0 0 0 0 0 ...
    :param y:
    :return:
    """
    res = []
    for i in y:
        y_array = numpy.zeros(10)
        y_array[i - 1] = 1
        res.append(y_array)
    return numpy.array(res)
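# (Note, my own addition) an equivalent vectorized one-hot encoding would be
# numpy.eye(10)[y - 1]; the explicit loop above is kept because it mirrors the exercise.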
# one-hot encode the labels
y = expand_y(y_raw)
def load_weight(path):
    """
    Load the pre-trained network weights
    :param path:
    :return:
    """
    data = sio.loadmat(path)
    return data['Theta1'], data['Theta2']
t1, t2 = load_weight('./data/ex4weights.mat')
def serialize(a, b):
    """
    当我们使用高级优化方法来优化神经网络时,我们需要将多个参数矩阵展开,才能传入优化函数,然后再恢复形状。
    序列化2矩阵
    这个方法的目的就是使参数扁平化
    # 在这个nn架构中,我们有theta1(25,401),theta2(10,26),它们的梯度是delta1,delta2
    :param a:
    :param b:
    :return:
    """
    # concatenate 方法可以完成数组拼接
    # ravel 方法可以将数据扁平化
    # 这个有个类似方法叫 flatten 该方法也可以做数据扁平化,和raval的区别是ravel返回的是引用,对ravel后的
    # 对象进行修改会影响到原数据,而flatten则会先copy 不会对原数据产生影响。
    return numpy.concatenate((numpy.ravel(a), numpy.ravel(b)))
# flatten the parameters: 25*401 + 10*26 = 10285
theta = serialize(t1, t2)
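# (Optional sanity check, my own addition) the unrolled vector should hold exactly 10285 values:
# assert theta.shape == (10285,)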
def feed_forward(theta, X):
    """apply to architecture 400+1 * 25+1 *10
    X: 5000 * 401
    """
    t1, t2 = deserialize(theta)  # t1: (25,401) t2: (10,26)
    m = X.shape[0]
    a1 = X  # 5000 * 401
    z2 = a1 @ t1.T  # 5000 * 25
    a2 = numpy.insert(sigmoid(z2), 0, numpy.ones(m), axis=1)  # 5000*26
    z3 = a2 @ t2.T  # 5000 * 10
    h = sigmoid(z3)  # 5000*10, this is h_theta(X)
    return a1, z2, a2, z3, h  # you need all those for backprop
def sigmoid(z):
    return 1 / (1 + numpy.exp(-z))
def deserialize(seq):
    """
    将扁平化处理之后的数据恢复到之前的样子(两个theta
    :param seq:
    :return:
    """
    #     """into ndarray of (25, 401), (10, 26)"""
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
# h is the hypothesis output of the network
# _, _, _, _, h = feed_forward(theta, X)
# print(h)
def cost(theta, X, y):
    """
    代价函数
    :param theta:
    :param X:
    :param y:
    :return:
    """
    m = X.shape[0]  # get the data size m
    _, _, _, _, h = feed_forward(theta, X)
    # h是已经激活之后的结果集 所以这里h不需要进行激活
    pair_computation = -numpy.multiply(y, numpy.log(h)) - numpy.multiply((1 - y), numpy.log(1 - h))
    return pair_computation.sum() / m
# cost(theta, X, y)
# 0.287629165161
# print(cost(theta, X, y))
"""
Back propagation
"""
def regularized_cost(theta, X, y, l=1):
    """
    正则化代价函数
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    t1, t2 = deserialize(theta)  # t1: (25,401) t2: (10,26)
    m = X.shape[0]
    # 忽略第一项,计算t1和t2的代价
    reg_t1 = (l / (2 * m)) * numpy.power(t1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * m)) * numpy.power(t2[:, 1:], 2).sum()
    cost1 = cost(theta, X, y)
    a1 = cost(theta, X, y) + reg_t1 + reg_t2
    return cost(theta, X, y) + reg_t1 + reg_t2
# 0.383769859091
# print(regularized_cost(theta, X, y))
def sigmoid_gradient(z):
    """
    sigmoid的导函数,梯度下降时使用
    """
    return numpy.multiply(sigmoid(z), 1 - sigmoid(z))
# 0.25
# print(sigmoid_gradient(0))
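# (Note, my own addition) this follows from g(z) = 1 / (1 + exp(-z)):
# g'(z) = g(z) * (1 - g(z)), so g'(0) = 0.5 * 0.5 = 0.25, matching the value above.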
def gradient(theta, X, y):
    """
    梯度下降
    :param theta:
    :param X:
    :param y:
    :return:
    """
    # unroll the flattened parameter vector
    t1, t2 = deserialize(theta)  # t1: (25,401) t2: (10,26)
    m = X.shape[0]
    # initialize the two gradient accumulators
    delta1 = numpy.zeros(t1.shape)  # (25, 401)
    delta2 = numpy.zeros(t2.shape)  # (10, 26)
    # input layer, hidden pre-activation, hidden activation, output pre-activation, output activation
    a1, z2, a2, z3, h = feed_forward(theta, X)
    # loop over every training example
    for i in range(m):
        a1i = a1[i, :]  # (1, 401)
        z2i = z2[i, :]  # (1, 25)
        a2i = a2[i, :]  # (1, 26)
        hi = h[i, :]  # (1, 10)
        yi = y[i, :]  # (1, 10)
        # output layer error
        d3i = hi - yi  # (1, 10)
        z2i = numpy.insert(z2i, 0, numpy.ones(1))  # make it (1, 26) to compute d2i
        # hidden layer error
        # d2i = numpy.multiply(t2.T @ d3i, sigmoid_gradient(z2i))  # (1, 26)
        # equivalent to the line above; note the difference between the matrix
        # product and the element-wise product: (1,26) * (1,26) = (1,26)
        d2i = (t2.T @ d3i) * sigmoid_gradient(z2i)
        # careful with np vector transpose
        # accumulate the gradients, watching the transposes
        delta2 += numpy.matrix(d3i).T @ numpy.matrix(a2i)  # (1, 10).T @ (1, 26) -> (10, 26)
        delta1 += numpy.matrix(d2i[1:]).T @ numpy.matrix(a1i)  # (1, 25).T @ (1, 401) -> (25, 401)
    delta1 = delta1 / m
    delta2 = delta2 / m
    return serialize(delta1, delta2)
# d1, d2 = deserialize(gradient(theta, X, y))
def regularized_gradient(theta, X, y, l=1):
    """
    正则化梯度下降
    don't regularize theta of bias terms
    不正则theta的偏置
    """
    m = X.shape[0]
    # 获取到负反馈之后计算到的两个网络参数
    # 这里delta  为经过梯度下降计算后的参数
    # t 为经过计算前的参数  然后用delta和t计算误差项
    delta1, delta2 = deserialize(gradient(theta, X, y))
    t1, t2 = deserialize(theta)
    # 不惩罚第一项:将最前面添加的一列置0,不加入计算(0*n)
    # 这里对t1、reg term d1 的操作我理解是在做正则化,加上一个原数据的比例,防止数据差距过大
    # 但是这里去掉加上reg term ,正确率不但没有下降,甚至觉得小涨...
    # 可能这个正则化也不是每个都需要的吧
    t1[:, 0] = 0
    reg_term_d1 = (l / m) * t1
    # 修改delta参数
    delta1 = delta1 + reg_term_d1
    t2[:, 0] = 0
    reg_term_d2 = (l / m) * t2
    delta2 = delta2 + reg_term_d2
    delta1_sum = numpy.sum(delta1)
    delta2_sum = numpy.sum(delta2)
    a1 = np.sum(serialize(delta1, delta2))
    return serialize(delta1, delta2)
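# (Note, my own addition) gradient_checking is referenced in the commented-out call below
# but was not part of this listing; a minimal numerical-gradient sketch that matches that
# call could look like the following:
def gradient_checking(theta, X, y, epsilon, regularized=False):
    def a_numeric_grad(plus, minus):
        """approximate a partial derivative from two perturbed parameter vectors"""
        if regularized:
            return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (epsilon * 2)
        return (cost(plus, X, y) - cost(minus, X, y)) / (epsilon * 2)
    numeric_grad = np.zeros(theta.shape)
    # perturb one parameter at a time (two cost calls per parameter, hence the slowness)
    for i in range(len(theta)):
        plus = theta.copy()
        minus = theta.copy()
        plus[i] += epsilon
        minus[i] -= epsilon
        numeric_grad[i] = a_numeric_grad(plus, minus)
    analytic_grad = regularized_gradient(theta, X, y) if regularized else gradient(theta, X, y)
    # relative difference between the numerical and analytical gradients;
    # it should be tiny (on the order of 1e-9) if back propagation is correct
    diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)
    print('Relative difference: {}'.format(diff))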
# this runs very slowly; run with care
# gradient_checking(theta, X, y, epsilon=0.0001, regularized=True)
def random_init(size):
    """
    随机初始化
    :param size:
    :return:
    """
    return np.random.uniform(-0.12, 0.12, size)
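# (Note, my own addition) the +/-0.12 range above matches the heuristic suggested in the
# exercise: roughly sqrt(6) / sqrt(L_in + L_out) = sqrt(6 / (400 + 25)) ≈ 0.12.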
def nn_training(X, y):
    """regularized version
    the architecture is hard coded here... won't generalize
    网络训练
    正则版本
    """
    # 初始化参数
    init_theta = random_init(10285)  # 25*401 + 10*26
    # init_theta = np.zeros(10285)
    # 使用scipy计算代价最小的点
    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       jac=regularized_gradient,
                       method='TNC')
    return res
res = nn_training(X, y)  # slow
print(res)
_, y_answer = load_data('./data/ex4data1.mat')
print(y_answer[:20])
final_theta = res.x
def show_accuracy(theta, X, y):
    """
    输出准确率
    :param theta:
    :param X:
    :param y:
    :return:
    """
    _, _, _, _, h = feed_forward(theta, X)
    y_pred = np.argmax(h, axis=1) + 1
    print(classification_report(y, y_pred))
"""
             precision    recall  f1-score   support
           1       0.97      0.99      0.98       500
           2       0.99      0.94      0.96       500
           3       0.90      0.98      0.94       500
           4       1.00      0.68      0.81       500
           5       1.00      0.46      0.63       500
           6       0.90      0.99      0.94       500
           7       1.00      0.78      0.88       500
           8       0.78      1.00      0.88       500
           9       0.63      0.99      0.77       500
          10       0.94      1.00      0.97       500
    accuracy                           0.88      5000
   macro avg       0.91      0.88      0.88      5000
weighted avg       0.91      0.88      0.88      5000
              precision    recall  f1-score   support
           1       0.98      0.97      0.97       500
           2       0.90      0.99      0.94       500
           3       0.95      0.94      0.94       500
           4       0.98      0.96      0.97       500
           5       1.00      0.81      0.89       500
           6       0.99      0.96      0.98       500
           7       0.98      0.94      0.96       500
           8       0.79      0.99      0.88       500
           9       0.98      0.91      0.94       500
          10       0.97      0.99      0.98       500
    accuracy                           0.95      5000
   macro avg       0.95      0.95      0.95      5000
weighted avg       0.95      0.95      0.95      5000
"""
show_accuracy(final_theta, X, y_raw)
def plot_hidden_layer(theta):
    """
    绘制隐藏层
    theta: (10285, )
    """
    final_theta1, _ = deserialize(theta)
    hidden_layer = final_theta1[:, 1:]  # ger rid of bias term theta
    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))
    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
plot_hidden_layer(final_theta)
plt.show()

After tidying things up, I typed the code again from scratch:

One thing to watch out for is the regularization parameter l: I mistyped it as the literal 1 several times.

Another is that the number of samples has to come from X, not from the parameter matrix t; I got that wrong in several places, as in the small example below.
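A tiny illustration of those two pitfalls (the variable names are just for this example):

m = X.shape[0]       # correct: the number of training examples (5000)
# m = t1.shape[0]    # wrong: 25, the number of hidden units
reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum()   # the letter l, not the digit 1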

My own implementation follows:

# -*- coding:utf-8 -*-
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.optimize as opt
from sklearn.metrics import classification_report
"""
Helper functions
"""
def load_data(path):
    """
    读取数据
    :param path:
    :param transpose:
    :return:
    """
    data = sio.loadmat(path)
    y = data.get('y')
    # 将y由列向量变成行向量
    y = y.reshape(y.shape[0])
    X = data.get('X')  # 5000*400
    return X, y
def sigmoid(Z):
    """
    sigmoid 函数
    :param Z:
    :return:
    """
    return 1 / (1 + np.exp(-Z))
def sigmoid_derivative(Z):
    """
    sigmoid 的导函数
    :param Z:
    :return:
    """
    # return sigmoid(Z) * (1 - sigmoid(Z))
    return np.multiply(sigmoid(Z), 1 - sigmoid(Z))
def serialize(theta1, theta2):
    """
    扁平化参数
    :param theta1:
    :param theta2:
    :return:
    """
    return np.concatenate((np.ravel(theta1), np.ravel(theta2)))
def deserialize(theta):
    """
    反扁平化参数
    :param theta:
    :return:
    """
    return theta[:25 * 401].reshape(25, 401), theta[25 * 401:].reshape(10, 26)
def feed_forward(theta, X):
    """
    神经网络计算
    :param theta:
    :param X:
    :return:
    """
    # 反扁平化获取当前参数
    t1, t2 = deserialize(theta)
    # 计算隐藏层激活前数据
    hiden_layar_init_data = X @ t1.T
    # 激活隐藏层函数,并增加一列偏置
    hiden_layar_final_data = np.insert(sigmoid(hiden_layar_init_data), 0, np.ones(X.shape[0]), axis=1)
    # 计算输出层激活前数据
    output_layar_init_data = hiden_layar_final_data @ t2.T
    # 激活输出层数据
    output_layar_final_data = sigmoid(output_layar_init_data)
    # 分别返回这四个数据
    return hiden_layar_init_data, hiden_layar_final_data, output_layar_init_data, output_layar_final_data
def gradient(theta, X, y, l=1):
    """
    梯度下降函数
    :return:
    """
    # 反序列化theta
    t1, t2 = deserialize(theta)
    # 初始化两个个t1,t2一样大的变量,用来存放需要变化的量以进行负反馈
    delta1 = np.zeros(t1.shape)
    delta2 = np.zeros(t2.shape)
    # 获取神经网络计算结果
    hiden_layar_init_data, hiden_layar_final_data, \
    output_layar_init_data, output_layar_final_data = feed_forward(theta, X)
    data_length = X.shape[0]
    for i in range(data_length):
        # 计算输出层误差
        output_layar_final_data_error = output_layar_final_data[i] - y[i]
        # 对隐藏层处理前的数据添加偏置列
        # hiden_layar_init_data_with_bias = np.insert(hiden_layar_init_data[i], 0, 1)
        hiden_layar_init_data_with_bias = np.insert(hiden_layar_init_data[i], 0, np.ones(1))
        # 计算隐藏层的误差
        hiden_layar_final_data_error = (t2.T @ output_layar_final_data_error) \
                                       * sigmoid_derivative(hiden_layar_init_data_with_bias)
        # 对delta进行反馈计算
        # delta1 += output_layar_final_data_error.T @ hiden_layar_final_data[i]
        delta2 += np.matrix(output_layar_final_data_error).T @ np.matrix(hiden_layar_final_data[i])
        delta1 += np.matrix(hiden_layar_final_data_error[1:]).T @ np.matrix(X[i])
    delta1 = delta1 / data_length
    delta2 = delta2 / data_length
    return serialize(delta1, delta2)
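# (Note, my own addition) the per-example loop above could also be written in a fully
# vectorized form, roughly:
#     d3 = output_layar_final_data - y                                      # (5000, 10)
#     d2 = (d3 @ t2)[:, 1:] * sigmoid_derivative(hiden_layar_init_data)     # (5000, 25)
#     delta2 = d3.T @ hiden_layar_final_data / data_length                  # (10, 26)
#     delta1 = d2.T @ X / data_length                                       # (25, 401)
# which is much faster, but the loop maps more directly onto the per-example derivation
# from the lectures, so it is kept above.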
def regularized_gradient(theta, X, y, l=1):
    """
    正则化梯度下降函数
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    # 原始参数去扁平化
    t1, t2 = deserialize(theta)
    # 进行下降之后的参数,这里为了防止delta的参数差距过大,需要依据原数据(t1 t2)对参数进行正则化
    delta1, delta2 = deserialize(gradient(theta, X, y))
    # 数据个数
    data_length = X.shape[0]
    # 不惩罚第一项 所以t1、t2的第一项设为0
    t1[:, 0] = 0
    t2[:, 0] = 0
    reg_t1 = (l / data_length) * t1
    reg_t2 = (l / data_length) * t2
    delta1 = delta1 + reg_t1
    delta2 = delta2 + reg_t2
    delta1_sum = np.sum(delta1)
    delta2_sum = np.sum(delta2)
    a2 = np.sum(serialize(delta1, delta2))
    return serialize(delta1, delta2)
def cost(theta, X, y, l=1):
    """
    代价计算函数
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    data_length = X.shape[0]
    _, _, _, output_layar_final_data = feed_forward(theta, X)
    pair_computation = -np.multiply(y, np.log(output_layar_final_data)) - np.multiply((1 - y), np.log(
        1 - output_layar_final_data))
    return pair_computation.sum() / data_length
    # return np.sum(
    #     -(y * np.log(output_layar_final_data)) - ((1 - y) * np.log(1 - output_layar_final_data))) / data_length
def expand_y(y):
    """
    处理结果向量
    将1 2 这种结果转化为 0 1 0 0 0 0 0  这种向量
    :param y:
    :return:
    """
    res = []
    for i in y:
        y_array = np.zeros(10)
        y_array[i - 1] = 1
        res.append(y_array)
    return np.array(res)
def regularized_cost(theta, X, y, l=1):
    """
    正则化代价计算函数
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    t1, t2 = deserialize(theta)
    data_length = X.shape[0]
    # 不惩罚第一项,将第一列去掉
    reg_t1 = (l / (2 * data_length)) * np.power(t1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * data_length)) * np.power(t2[:, 1:], 2).sum()
    cost1 = cost(theta, X, y)
    a2 = cost(theta, X, y, l) + reg_t1 + reg_t2
    return cost(theta, X, y, l) + reg_t1 + reg_t2
def show_accuracy(theta, X, y):
    """
    输出准确率
    :param theta:
    :param X:
    :param y:
    :return:
    """
    _, _, _, h = feed_forward(theta, X)
    y_pred = np.argmax(h, axis=1) + 1
    print(classification_report(y, y_pred))
def random_init(size):
    """
    随机初始化
    :param size:
    :return:
    """
    return np.random.uniform(-0.12, 0.12, size)
def network_training(X, y):
    """
    神经网络训练
    :param X:
    :param y:
    :return:
    """
    init_theta = random_init(401 * 25 + 26 * 10)
    # init_theta = np.zeros(10285)
    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       jac=regularized_gradient,
                       method='TNC')
    return res
def plot_hidden_layer(theta):
    """
    绘制隐藏层
    theta: (10285, )
    """
    final_theta1, _ = deserialize(theta)
    hidden_layer = final_theta1[:, 1:]  # ger rid of bias term theta
    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))
    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
if __name__ == '__main__':
    X, y = load_data('./data/ex4data1.mat')
    # one-hot encode the labels (despite its name, y_raw holds the expanded labels here)
    y_raw = expand_y(y)
    # add the bias column of ones
    X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)
    res = network_training(X, y_raw)
    print(res)
    show_accuracy(res.x, X, y)
    plot_hidden_layer(res.x)
    plt.show()
