import gzip
import struct

import numpy as np


def parse_mnist(image_filename, label_filename):
    with gzip.open(image_filename, 'rb') as image_file, gzip.open(label_filename, 'rb') as label_file:
        # Read magic numbers from image and label files
        image_magic = struct.unpack('>I', image_file.read(4))[0]
        label_magic = struct.unpack('>I', label_file.read(4))[0]
        # Verify magic numbers
        if image_magic != 2051 or label_magic != 2049:
            raise ValueError("Invalid MNIST file format")
        # Read number of images and labels
        num_images = struct.unpack('>I', image_file.read(4))[0]
        num_labels = struct.unpack('>I', label_file.read(4))[0]
        # Verify that the number of images and labels match
        if num_images != num_labels:
            raise ValueError("Number of images and labels do not match")
        # Read image dimensions
        num_rows = struct.unpack('>I', image_file.read(4))[0]
        num_cols = struct.unpack('>I', image_file.read(4))[0]
        input_dim = num_rows * num_cols
        # Read images and labels into numpy arrays
        X = np.frombuffer(image_file.read(), dtype=np.uint8).reshape(num_images, input_dim)
        y = np.frombuffer(label_file.read(), dtype=np.uint8)
        # Normalize X to have values between 0.0 and 1.0
        X = X.astype(np.float32) / 255.0
        return X, y
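A quick way to sanity-check the parser is to load the training set and look at the shapes; the paths below are just placeholders for wherever the gzipped MNIST files live.

X_train, y_train = parse_mnist("data/train-images-idx3-ubyte.gz",
                               "data/train-labels-idx1-ubyte.gz")
print(X_train.shape, X_train.dtype)  # (60000, 784) float32
print(y_train.shape, y_train.dtype)  # (60000,) uint8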
In the softmax_loss function below, Z holds the logit predictions for each class and y holds the true labels. Here's a breakdown of the steps:

1. Compute the exponential of Z with np.exp(Z).
2. Sum the exponentials along axis 1 (the class dimension) with np.sum(exp_Z, axis=1, keepdims=True).
3. Divide each element of exp_Z by that sum to get the softmax probabilities for each example.
4. Select the probability of the correct class with advanced indexing, probs[np.arange(batch_size), y], where np.arange(batch_size) generates the array [0, 1, ..., batch_size-1] that picks out the right entry for each example.
5. Take the negative logarithm of those probabilities with -np.log(yprob).
6. Finally, average the per-example losses over the batch with np.mean(loss).
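In formula form, the steps above compute the average cross-entropy loss over a batch of size B:

\ell(Z, y) = -\frac{1}{B} \sum_{i=1}^{B} \log \frac{\exp(z_{i, y_i})}{\sum_{j} \exp(z_{ij})}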
def softmax_loss(Z, y):
    """ Return softmax loss. Note that for the purposes of this assignment,
    you don't need to worry about "nicely" scaling the numerical properties
    of the log-sum-exp computation, but can just compute this directly.

    Args:
        Z (np.ndarray[np.float32]): 2D numpy array of shape
            (batch_size, num_classes), containing the logit predictions for
            each class.
        y (np.ndarray[np.int8]): 1D numpy array of shape (batch_size, )
            containing the true label of each example.

    Returns:
        Average softmax loss over the sample.
    """
    ### BEGIN YOUR CODE
    batch_size = Z.shape[0]
    # Softmax probabilities for every example
    exp_Z = np.exp(Z)
    probs = exp_Z / np.sum(exp_Z, axis=1, keepdims=True)
    # Probability assigned to the true class of each example
    yprob = probs[np.arange(batch_size), y]
    # Average negative log-likelihood over the batch
    loss = -np.log(yprob)
    ave_loss = np.mean(loss)
    return ave_loss
    ### END YOUR CODE
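A tiny sanity check: all-zero logits give a uniform prediction over the 10 classes, so the loss should come out to log 10.

Z = np.zeros((4, 10), dtype=np.float32)   # uniform prediction for 4 examples
y = np.array([0, 3, 7, 9], dtype=np.uint8)
print(softmax_loss(Z, y))                 # ~2.3026 == np.log(10)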
def softmax_regression_epoch(X, y, theta, lr=0.1, batch=100):
    """ Run a single epoch of SGD for softmax regression on the data, using
    the step size lr and specified batch size. This function should modify the
    theta matrix in place, and you should iterate through batches in X _without_
    randomizing the order.

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        theta (np.ndarray[np.float32]): 2D array of softmax regression
            parameters, of shape (input_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD minibatch

    Returns:
        None
    """
    ### BEGIN YOUR CODE
    num_examples = X.shape[0]
    num_batches = num_examples // batch
    num_classes = theta.shape[1]
    for i in range(num_batches):
        X_batch = X[i * batch:(i + 1) * batch]
        y_batch = y[i * batch:(i + 1) * batch]
        # Forward pass: logits and softmax probabilities
        Z = np.dot(X_batch, theta)
        probs = np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)
        # One-hot encoding of the true labels
        I_y = np.zeros((batch, num_classes))
        I_y[np.arange(batch), y_batch] = 1
        # Gradient of the softmax loss w.r.t. theta, averaged over the batch
        grad = np.dot(X_batch.T, (probs - I_y)) / batch
        theta -= lr * grad
    ### END YOUR CODE
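A sketch of how this can be driven for a few epochs on the arrays loaded by parse_mnist above; the epoch count and learning rate here are arbitrary choices, not values prescribed anywhere.

theta = np.zeros((X_train.shape[1], 10), dtype=np.float32)
for epoch in range(10):
    softmax_regression_epoch(X_train, y_train, theta, lr=0.1, batch=100)
    logits = X_train @ theta
    err = np.mean(np.argmax(logits, axis=1) != y_train)
    print(f"epoch {epoch}: loss={softmax_loss(logits, y_train):.4f}  err={err:.4f}")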
def nn_epoch(X, y, W1, W2, lr=0.1, batch=100):
    """ Run a single epoch of SGD for a two-layer neural network defined by the
    weights W1 and W2 (with no bias terms):
        logits = ReLU(X * W1) * W2
    The function should use the step size lr, and the specified batch size (and
    again, without randomizing the order of X). It should modify the
    W1 and W2 matrices in place.

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        W1 (np.ndarray[np.float32]): 2D array of first layer weights, of shape
            (input_dim, hidden_dim)
        W2 (np.ndarray[np.float32]): 2D array of second layer weights, of shape
            (hidden_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD minibatch

    Returns:
        None
    """
    ### BEGIN YOUR CODE
    num_examples = X.shape[0]
    num_batches = num_examples // batch
    num_classes = W2.shape[1]
    for i in range(num_batches):
        X_batch = X[i * batch:(i + 1) * batch]
        y_batch = y[i * batch:(i + 1) * batch]
        # Forward pass: hidden activations and softmax probabilities
        Z1 = np.maximum(np.dot(X_batch, W1), 0)           # ReLU(X W1)
        Z1W2 = np.dot(Z1, W2)
        probs = np.exp(Z1W2) / np.sum(np.exp(Z1W2), axis=1, keepdims=True)
        # One-hot encoding of the true labels
        I_y = np.zeros((batch, num_classes))
        I_y[np.arange(batch), y_batch] = 1
        # Backward pass: G2 is the gradient at the logits, G1 at the hidden layer
        G2 = probs - I_y
        G1 = (Z1 > 0) * np.dot(G2, W2.T)                  # ReLU derivative mask
        grad1 = np.dot(X_batch.T, G1) / batch
        grad2 = np.dot(Z1.T, G2) / batch
        W1 -= lr * grad1
        W2 -= lr * grad2
    ### END YOUR CODE
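The same kind of driver works for the two-layer network; the hidden size and the random initialization scale below are assumptions for illustration only.

hidden_dim = 100                                   # assumed hidden size
rng = np.random.default_rng(0)
W1 = (rng.standard_normal((X_train.shape[1], hidden_dim)) / np.sqrt(hidden_dim)).astype(np.float32)
W2 = (rng.standard_normal((hidden_dim, 10)) / np.sqrt(10)).astype(np.float32)
for epoch in range(10):
    nn_epoch(X_train, y_train, W1, W2, lr=0.2, batch=100)
    logits = np.maximum(X_train @ W1, 0) @ W2
    print(f"epoch {epoch}: loss={softmax_loss(logits, y_train):.4f}")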
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

// Forward declaration; the helper routines below are used by the definition further down.
void softmax_regression_epoch_cpp(const float *X, const unsigned char *y,
                                  float *theta, size_t m, size_t n, size_t k,
                                  float lr, size_t batch);
// P = M * N, where M is (a x b), N is (b x c), and P is (a x c), all row-major.
void matrix_multi(const float *M, const float *N, float *P, size_t a, size_t b, size_t c) {
    for (size_t row = 0; row < a; row++) {
        for (size_t col = 0; col < c; col++) {
            P[row * c + col] = 0.0f;
            for (size_t mid = 0; mid < b; mid++) {
                P[row * c + col] += M[row * b + mid] * N[mid * c + col];
            }
        }
    }
}
// Row-wise softmax in place: each row of the (a x b) matrix M is exponentiated
// (shifted by its row max for numerical stability) and divided by its sum.
void normalize(float *M, size_t a, size_t b) {
    for (size_t row = 0; row < a; row++) {
        float max = M[row * b];
        float sum = 0.0f;
        for (size_t col = 0; col < b; col++) {
            max = std::max(max, M[row * b + col]);
        }
        for (size_t col = 0; col < b; col++) {
            M[row * b + col] = std::exp(M[row * b + col] - max);
            sum += M[row * b + col];
        }
        for (size_t col = 0; col < b; col++) {
            M[row * b + col] /= sum;
        }
    }
}
// N = M^T, where M is (a x b) and N is (b x a), both row-major.
void transpose(const float *M, float *N, size_t a, size_t b) {
    for (size_t row = 0; row < a; row++) {
        for (size_t col = 0; col < b; col++) {
            N[col * a + row] = M[row * b + col];
        }
    }
}
// Debug helper: print an (a x b) row-major matrix.
void print(const float *M, size_t a, size_t b) {
    for (size_t row = 0; row < a; row++) {
        for (size_t col = 0; col < b; col++) {
            std::cout << M[row * b + col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
void softmax_regression_epoch_cpp(const float *X, const unsigned char *y,
                                  float *theta, size_t m, size_t n, size_t k,
                                  float lr, size_t batch)
{
    float *Z = new float[batch * k];    // buffer for the computed logits / probabilities
    float *grad = new float[n * k];     // buffer for the computed gradient
    float *X_T = new float[n * batch];  // buffer for the transposed minibatch
    for (size_t batch_start = 0; batch_start < m; batch_start += batch)
    {
        size_t batch_end = std::min(m, batch_start + batch);
        // Z = normalize(X * theta)
        matrix_multi(X + batch_start * n, theta, Z, batch_end - batch_start, n, k);
        normalize(Z, batch_end - batch_start, k);
        // Z - I_y: subtract the one-hot encoding of the true labels
        for (size_t i = 0; i < (batch_end - batch_start); i++)
        {
            Z[i * k + y[i + batch_start]] -= 1.0f;
        }
        // X_T = transpose of the current minibatch
        transpose(X + batch_start * n, X_T, batch_end - batch_start, n);
        // grad = X_T * (Z - I_y)
        matrix_multi(X_T, Z, grad, n, batch_end - batch_start, k);
        // SGD update, averaging the gradient over the minibatch
        for (size_t i = 0; i < n * k; i++)
        {
            theta[i] -= lr * grad[i] / (batch_end - batch_start);
        }
    }
    delete[] Z;
    delete[] grad;
    delete[] X_T;
}
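A minimal sketch of a standalone smoke test for the C++ routine, assuming it is compiled into its own small program; the matrix sizes and random data are arbitrary.

#include <random>
#include <vector>

int main() {
    const size_t m = 200, n = 5, k = 3;   // arbitrary problem sizes
    std::mt19937 gen(0);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    std::vector<float> X(m * n);
    std::vector<unsigned char> y(m);
    for (size_t i = 0; i < m * n; i++) X[i] = dist(gen);
    for (size_t i = 0; i < m; i++) y[i] = static_cast<unsigned char>(i % k);
    std::vector<float> theta(n * k, 0.0f);
    softmax_regression_epoch_cpp(X.data(), y.data(), theta.data(), m, n, k, 0.1f, 50);
    print(theta.data(), n, k);            // theta should no longer be all zeros
    return 0;
}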