Andrew Ng's Machine Learning, Week 5: Neural Network Backpropagation
This section trains a neural network with backpropagation.
Both the cost function and the gradient function are regularized.
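For reference, the regularized cost that the code below minimizes is the usual one from the course:

$$
J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\left[ y_k^{(i)}\log\big(h_\Theta(x^{(i)})\big)_k + \big(1-y_k^{(i)}\big)\log\Big(1-\big(h_\Theta(x^{(i)})\big)_k\Big)\right] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}\big(\Theta_{j,i}^{(l)}\big)^2
$$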
After a lot of fiddling I finally finished this week's lectures and assignment. The part I implemented myself, in particular, hit plenty of pitfalls that never show up when you just follow the reference code.
The hidden-layer visualization it produces is shown below:
[figure: 5x5 grid of the 25 hidden-unit weight images]
Source code, with comments and my notes:
Accuracy lands roughly in the 90-95% range.
# -*- coding:utf-8 -*-
import matplotlib.pyplot as plt
import numpy
import numpy as np
import scipy.io as sio
import matplotlib
import scipy.optimize as opt
from sklearn.metrics import classification_report
def load_data(path, transpose=True):
    """
    Load the data.
    :param path: path to the .mat file
    :param transpose: whether to transpose each image
    :return: X, y
    """
    data = sio.loadmat(path)
    y = data.get('y')
    # turn y from a column vector into a 1-D array
    y = y.reshape(y.shape[0])
    X = data.get('X')  # 5000*400
    if transpose:
        # transpose each image (i.e. rotate it); the data becomes 5000 matrices of 20*20
        X = numpy.array([im.reshape((20, 20)).T for im in X])
        # flatten each rotated image back into one row, giving 5000*400
        X = numpy.array([im.reshape(400) for im in X])
    return X, y
X, _ = load_data('./data/ex4data1.mat', transpose=False)
def plot_100_image(X):
    """
    Plot 100 images drawn at random from X.
    """
    # image side length (sqrt of 400)
    size = int(numpy.sqrt(X.shape[1]))
    # pick 100 rows of X at random
    sample_idx = numpy.random.choice(numpy.arange(X.shape[0]), 100)
    # take the sampled rows
    sample_images = X[sample_idx, :]
    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
    for r in range(10):
        for c in range(10):
            ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
                                   cmap=matplotlib.cm.binary)
    plt.xticks(numpy.array([]))
    plt.yticks(numpy.array([]))
# plot 100 random images
# plot_100_image(X)
# plt.show()
"""
前馈神经网络算法
"""
X_raw, y_raw = load_data('./data/ex4data1.mat', transpose=False)
# 增加全部为0的一列 (5000, 401)
X = numpy.insert(X_raw, 0, numpy.ones(X_raw.shape[0]), axis=1)
def expand_y(y):
    """
    Expand the label vector.
    Turns a label such as 2 into a one-hot vector like 0 1 0 0 0 0 0 0 0 0.
    :param y:
    :return:
    """
    res = []
    for i in y:
        y_array = numpy.zeros(10)
        y_array[i - 1] = 1
        res.append(y_array)
    return numpy.array(res)
# one-hot encode the labels
y = expand_y(y_raw)
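# e.g. label 3 becomes [0 0 1 0 0 0 0 0 0 0]; in this dataset the digit 0 is stored
# as label 10, so it ends up in the last slot of the one-hot vector.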
def load_weight(path):
    """
    Load the pre-trained weights.
    :param path:
    :return:
    """
    data = sio.loadmat(path)
    return data['Theta1'], data['Theta2']
t1, t2 = load_weight('./data/ex4weights.mat')
def serialize(a, b):
    """
    When we hand the network to an advanced optimizer, the parameter matrices have to be
    unrolled into a single vector before being passed in, and reshaped back afterwards.
    Serialize the two matrices, i.e. flatten the parameters.
    # In this architecture we have theta1 (25, 401) and theta2 (10, 26);
    # their gradients are delta1 and delta2.
    :param a:
    :param b:
    :return:
    """
    # concatenate joins the two arrays
    # ravel flattens an array
    # flatten does the same job; the difference is that ravel returns a view, so modifying
    # the result can affect the original data, while flatten copies first and leaves the original untouched.
    return numpy.concatenate((numpy.ravel(a), numpy.ravel(b)))
# flattened parameters: 25*401 + 10*26 = 10285
theta = serialize(t1, t2)
def feed_forward(theta, X):
    """apply to architecture 400+1 * 25+1 * 10
    X: 5000 * 401
    """
    t1, t2 = deserialize(theta)  # t1: (25,401) t2: (10,26)
    m = X.shape[0]
    a1 = X  # 5000 * 401
    z2 = a1 @ t1.T  # 5000 * 25
    a2 = numpy.insert(sigmoid(z2), 0, numpy.ones(m), axis=1)  # 5000*26
    z3 = a2 @ t2.T  # 5000 * 10
    h = sigmoid(z3)  # 5000*10, this is h_theta(X)
    return a1, z2, a2, z3, h  # you need all those for backprop
def sigmoid(z):
    return 1 / (1 + numpy.exp(-z))
def deserialize(seq):
    """
    Restore the flattened parameter vector to the two theta matrices,
    i.e. into ndarrays of (25, 401) and (10, 26).
    :param seq:
    :return:
    """
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
# h is the network's output
# _, _, _, _, h = feed_forward(theta, X)
# print(h)
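# Quick sanity check (my addition, not part of the exercise): a serialize/deserialize
# round trip should reproduce the original matrices exactly.
# assert numpy.array_equal(deserialize(serialize(t1, t2))[0], t1)
# assert numpy.array_equal(deserialize(serialize(t1, t2))[1], t2)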
def cost(theta, X, y):
    """
    Cost function.
    :param theta:
    :param X:
    :param y:
    :return:
    """
    m = X.shape[0]  # number of training examples
    _, _, _, _, h = feed_forward(theta, X)
    # h is already the activated output, so no further sigmoid is needed here
    pair_computation = -numpy.multiply(y, numpy.log(h)) - numpy.multiply((1 - y), numpy.log(1 - h))
    return pair_computation.sum() / m
# cost(theta, X, y)
# 0.287629165161
# print(cost(theta, X, y))
"""
负反馈神经网络
"""
def regularized_cost(theta, X, y, l=1):
    """
    Regularized cost function.
    :param theta:
    :param X:
    :param y:
    :param l: regularization parameter
    :return:
    """
    t1, t2 = deserialize(theta)  # t1: (25,401) t2: (10,26)
    m = X.shape[0]
    # skip the bias column when computing the penalty for t1 and t2
    reg_t1 = (l / (2 * m)) * numpy.power(t1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * m)) * numpy.power(t2[:, 1:], 2).sum()
    return cost(theta, X, y) + reg_t1 + reg_t2
# 0.383769859091
# print(regularized_cost(theta, X, y))
def sigmoid_gradient(z):
    """
    Derivative of the sigmoid, used during backpropagation.
    """
    return numpy.multiply(sigmoid(z), 1 - sigmoid(z))
# 0.25
# print(sigmoid_gradient(0))
def gradient(theta, X, y):
    """
    Compute the gradient via backpropagation.
    :param theta:
    :param X:
    :param y:
    :return:
    """
    # unflatten the parameter vector
    t1, t2 = deserialize(theta)  # t1: (25,401) t2: (10,26)
    m = X.shape[0]
    # initialize the two gradient accumulators
    delta1 = numpy.zeros(t1.shape)  # (25, 401)
    delta2 = numpy.zeros(t2.shape)  # (10, 26)
    # input layer, hidden layer before/after activation, output layer before/after activation
    a1, z2, a2, z3, h = feed_forward(theta, X)
    # loop over the training examples
    for i in range(m):
        a1i = a1[i, :]  # (1, 401)
        z2i = z2[i, :]  # (1, 25)
        a2i = a2[i, :]  # (1, 26)
        hi = h[i, :]  # (1, 10)
        yi = y[i, :]  # (1, 10)
        # output-layer error
        d3i = hi - yi  # (1, 10)
        z2i = numpy.insert(z2i, 0, numpy.ones(1))  # make it (1, 26) to compute d2i
        # hidden-layer error
        # d2i = numpy.multiply(t2.T @ d3i, sigmoid_gradient(z2i))  # (1, 26)
        # equivalent to the line above
        # note the difference between matrix multiplication and the element-wise product: (1,26) * (1,26) = (1,26)
        d2i = (t2.T @ d3i) * sigmoid_gradient(z2i)
        # careful with np vector transpose
        # accumulate the gradients, watching the transposes
        delta2 += numpy.matrix(d3i).T @ numpy.matrix(a2i)  # (1, 10).T @ (1, 26) -> (10, 26)
        delta1 += numpy.matrix(d2i[1:]).T @ numpy.matrix(a1i)  # (1, 25).T @ (1, 401) -> (25, 401)
    delta1 = delta1 / m
    delta2 = delta2 / m
    return serialize(delta1, delta2)
# d1, d2 = deserialize(gradient(theta, X, y))
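# In matrix notation, the loop above implements the lecture's backprop equations:
#   d3 = h - y,  d2 = (Theta2.T @ d3) * g'(z2),
#   Delta2 += d3.T @ a2,  Delta1 += d2[1:].T @ a1,
# and finally both accumulators are divided by m.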
def regularized_gradient(theta, X, y, l=1):
    """
    Regularized gradient.
    don't regularize theta of bias terms
    The bias column of theta is not regularized.
    """
    m = X.shape[0]
    # delta1/delta2 are the gradients returned by backpropagation,
    # t1/t2 are the current parameters; the regularization term is built from t1/t2.
    delta1, delta2 = deserialize(gradient(theta, X, y))
    t1, t2 = deserialize(theta)
    # don't penalize the bias term: zero out the first column so it contributes nothing (0*n)
    # the reg term keeps the weights from growing too large; in my runs, dropping it
    # did not hurt accuracy and even improved it slightly, so regularization is
    # apparently not always necessary for this problem.
    t1[:, 0] = 0
    reg_term_d1 = (l / m) * t1
    # add the regularization term to the gradient
    delta1 = delta1 + reg_term_d1
    t2[:, 0] = 0
    reg_term_d2 = (l / m) * t2
    delta2 = delta2 + reg_term_d2
    return serialize(delta1, delta2)
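# The gradient_checking helper called below is not defined in this listing; a minimal
# sketch (my own, following the standard two-sided finite-difference check from the
# exercise, with an assumed signature) could look like this:
def gradient_checking(theta, X, y, epsilon=0.0001, regularized=False):
    """Compare the backprop gradient with a numerical approximation (very slow)."""
    cost_func = regularized_cost if regularized else cost
    grad_func = regularized_gradient if regularized else gradient
    analytic_grad = grad_func(theta, X, y)
    numeric_grad = numpy.zeros(theta.shape)
    for i in range(len(theta)):
        plus, minus = theta.copy(), theta.copy()
        plus[i] += epsilon
        minus[i] -= epsilon
        # two-sided difference approximation of the i-th partial derivative
        numeric_grad[i] = (cost_func(plus, X, y) - cost_func(minus, X, y)) / (2 * epsilon)
    # the relative difference should be tiny (around 1e-9) if backprop is correct
    diff = numpy.linalg.norm(numeric_grad - analytic_grad) / numpy.linalg.norm(numeric_grad + analytic_grad)
    print(diff)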
# this is very slow, run with care
# gradient_checking(theta, X, y, epsilon=0.0001, regularized=True)
def random_init(size):
    """
    Random initialization.
    :param size:
    :return:
    """
    return np.random.uniform(-0.12, 0.12, size)
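# The +-0.12 range follows the epsilon_init suggested in the exercise; initializing
# at random (rather than with all zeros) breaks the symmetry between hidden units.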
def nn_training(X, y):
    """regularized version
    the architecture is hard coded here... won't generalize
    Train the network (regularized version).
    """
    # initialize the parameters
    init_theta = random_init(10285)  # 25*401 + 10*26
    # init_theta = np.zeros(10285)
    # use scipy to find the parameters that minimize the cost
    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       jac=regularized_gradient,
                       method='TNC')
    return res
res = nn_training(X, y)  # slow
print(res)
_, y_answer = load_data('./data/ex4data1.mat')
print(y_answer[:20])
final_theta = res.x
def show_accuracy(theta, X, y):
    """
    Print the classification report.
    :param theta:
    :param X:
    :param y:
    :return:
    """
    _, _, _, _, h = feed_forward(theta, X)
    y_pred = np.argmax(h, axis=1) + 1
    print(classification_report(y, y_pred))
"""
precision recall f1-score support
1 0.97 0.99 0.98 500
2 0.99 0.94 0.96 500
3 0.90 0.98 0.94 500
4 1.00 0.68 0.81 500
5 1.00 0.46 0.63 500
6 0.90 0.99 0.94 500
7 1.00 0.78 0.88 500
8 0.78 1.00 0.88 500
9 0.63 0.99 0.77 500
10 0.94 1.00 0.97 500
accuracy 0.88 5000
macro avg 0.91 0.88 0.88 5000
weighted avg 0.91 0.88 0.88 5000
precision recall f1-score support
1 0.98 0.97 0.97 500
2 0.90 0.99 0.94 500
3 0.95 0.94 0.94 500
4 0.98 0.96 0.97 500
5 1.00 0.81 0.89 500
6 0.99 0.96 0.98 500
7 0.98 0.94 0.96 500
8 0.79 0.99 0.88 500
9 0.98 0.91 0.94 500
10 0.97 0.99 0.98 500
accuracy 0.95 5000
macro avg 0.95 0.95 0.95 5000
weighted avg 0.95 0.95 0.95 5000
"""
show_accuracy(final_theta, X, y_raw)
def plot_hidden_layer(theta):
    """
    Plot the hidden layer.
    theta: (10285, )
    """
    final_theta1, _ = deserialize(theta)
    hidden_layer = final_theta1[:, 1:]  # get rid of the bias term
    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))
    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                                   cmap=matplotlib.cm.binary)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
plot_hidden_layer(final_theta)
plt.show()
After tidying up the code I typed the whole thing out again myself.
One thing to watch is the regularization parameter l: I typed it as the literal 1 several times.
Another is that the number of examples has to come from X, not from the parameter vector t; I got that wrong in a few places. A one-line illustration of both slips follows.
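To make those two slips concrete (hypothetical one-liners, using the same variable names as the code below):

# wrong: literal 1 instead of l, and the count taken from the parameter vector
# reg_t1 = (1 / (2 * theta.shape[0])) * np.power(t1[:, 1:], 2).sum()
# right: use l and the number of examples X.shape[0]
# reg_t1 = (l / (2 * X.shape[0])) * np.power(t1[:, 1:], 2).sum()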
My own implementation is below:
# -*- coding:utf-8 -*-
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.optimize as opt
from sklearn.metrics import classification_report
"""
相关方法
"""
def load_data(path):
    """
    Load the data.
    :param path:
    :return:
    """
    data = sio.loadmat(path)
    y = data.get('y')
    # turn y from a column vector into a 1-D array
    y = y.reshape(y.shape[0])
    X = data.get('X')  # 5000*400
    return X, y
def sigmoid(Z):
    """
    Sigmoid function.
    :param Z:
    :return:
    """
    return 1 / (1 + np.exp(-Z))
def sigmoid_derivative(Z):
    """
    Derivative of the sigmoid.
    :param Z:
    :return:
    """
    # return sigmoid(Z) * (1 - sigmoid(Z))
    return np.multiply(sigmoid(Z), 1 - sigmoid(Z))
def serialize(theta1, theta2):
    """
    Flatten the parameter matrices into a single vector.
    :param theta1:
    :param theta2:
    :return:
    """
    return np.concatenate((np.ravel(theta1), np.ravel(theta2)))
def deserialize(theta):
    """
    Restore the flattened parameter vector to the two matrices.
    :param theta:
    :return:
    """
    return theta[:25 * 401].reshape(25, 401), theta[25 * 401:].reshape(10, 26)
def feed_forward(theta, X):
    """
    Forward pass through the network.
    :param theta:
    :param X:
    :return:
    """
    # unflatten the current parameters
    t1, t2 = deserialize(theta)
    # hidden layer before activation
    hiden_layar_init_data = X @ t1.T
    # activate the hidden layer and prepend a bias column
    hiden_layar_final_data = np.insert(sigmoid(hiden_layar_init_data), 0, np.ones(X.shape[0]), axis=1)
    # output layer before activation
    output_layar_init_data = hiden_layar_final_data @ t2.T
    # activate the output layer
    output_layar_final_data = sigmoid(output_layar_init_data)
    # return all four intermediate results
    return hiden_layar_init_data, hiden_layar_final_data, output_layar_init_data, output_layar_final_data
def gradient(theta, X, y, l=1):
    """
    Compute the gradient via backpropagation.
    :return:
    """
    # unflatten theta
    t1, t2 = deserialize(theta)
    # accumulators the same size as t1 and t2, holding the gradient for the update
    delta1 = np.zeros(t1.shape)
    delta2 = np.zeros(t2.shape)
    # run the forward pass
    hiden_layar_init_data, hiden_layar_final_data, \
        output_layar_init_data, output_layar_final_data = feed_forward(theta, X)
    data_length = X.shape[0]
    for i in range(data_length):
        # output-layer error
        output_layar_final_data_error = output_layar_final_data[i] - y[i]
        # prepend a bias entry to the hidden layer's pre-activation values
        # hiden_layar_init_data_with_bias = np.insert(hiden_layar_init_data[i], 0, 1)
        hiden_layar_init_data_with_bias = np.insert(hiden_layar_init_data[i], 0, np.ones(1))
        # hidden-layer error
        hiden_layar_final_data_error = (t2.T @ output_layar_final_data_error) \
            * sigmoid_derivative(hiden_layar_init_data_with_bias)
        # accumulate the gradients
        # delta1 += output_layar_final_data_error.T @ hiden_layar_final_data[i]
        delta2 += np.matrix(output_layar_final_data_error).T @ np.matrix(hiden_layar_final_data[i])
        delta1 += np.matrix(hiden_layar_final_data_error[1:]).T @ np.matrix(X[i])
    delta1 = delta1 / data_length
    delta2 = delta2 / data_length
    return serialize(delta1, delta2)
def regularized_gradient(theta, X, y, l=1):
    """
    Regularized gradient.
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    # unflatten the original parameters
    t1, t2 = deserialize(theta)
    # gradients from backpropagation; the regularization term built from t1/t2 is added to them
    delta1, delta2 = deserialize(gradient(theta, X, y))
    # number of examples
    data_length = X.shape[0]
    # don't penalize the bias term, so zero out the first column of t1 and t2
    t1[:, 0] = 0
    t2[:, 0] = 0
    reg_t1 = (l / data_length) * t1
    reg_t2 = (l / data_length) * t2
    delta1 = delta1 + reg_t1
    delta2 = delta2 + reg_t2
    return serialize(delta1, delta2)
def cost(theta, X, y, l=1):
    """
    Cost function.
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    data_length = X.shape[0]
    _, _, _, output_layar_final_data = feed_forward(theta, X)
    pair_computation = -np.multiply(y, np.log(output_layar_final_data)) - np.multiply((1 - y), np.log(
        1 - output_layar_final_data))
    return pair_computation.sum() / data_length
    # return np.sum(
    #     -(y * np.log(output_layar_final_data)) - ((1 - y) * np.log(1 - output_layar_final_data))) / data_length
def expand_y(y):
    """
    Expand the label vector.
    Turns a label such as 2 into a one-hot vector like 0 1 0 0 0 0 0 0 0 0.
    :param y:
    :return:
    """
    res = []
    for i in y:
        y_array = np.zeros(10)
        y_array[i - 1] = 1
        res.append(y_array)
    return np.array(res)
def regularized_cost(theta, X, y, l=1):
    """
    Regularized cost function.
    :param theta:
    :param X:
    :param y:
    :param l:
    :return:
    """
    t1, t2 = deserialize(theta)
    data_length = X.shape[0]
    # don't penalize the bias term, so drop the first column
    reg_t1 = (l / (2 * data_length)) * np.power(t1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * data_length)) * np.power(t2[:, 1:], 2).sum()
    return cost(theta, X, y, l) + reg_t1 + reg_t2
def show_accuracy(theta, X, y):
    """
    Print the classification report.
    :param theta:
    :param X:
    :param y:
    :return:
    """
    _, _, _, h = feed_forward(theta, X)
    y_pred = np.argmax(h, axis=1) + 1
    print(classification_report(y, y_pred))
def random_init(size):
    """
    Random initialization.
    :param size:
    :return:
    """
    return np.random.uniform(-0.12, 0.12, size)
def network_training(X, y):
    """
    Train the network.
    :param X:
    :param y:
    :return:
    """
    init_theta = random_init(401 * 25 + 26 * 10)
    # init_theta = np.zeros(10285)
    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       jac=regularized_gradient,
                       method='TNC')
    return res
def plot_hidden_layer(theta):
    """
    Plot the hidden layer.
    theta: (10285, )
    """
    final_theta1, _ = deserialize(theta)
    hidden_layer = final_theta1[:, 1:]  # get rid of the bias term
    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))
    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                                   cmap=matplotlib.cm.binary)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
if __name__ == '__main__':
    X, y = load_data('./data/ex4data1.mat')
    y_raw = expand_y(y)
    X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)
    res = network_training(X, y_raw)
    print(res)
    show_accuracy(res.x, X, y)
    plot_hidden_layer(res.x)
    plt.show()