# Notebook shell escapes: fetch the house-number dataset files into ./data.
# `mkdir -p` creates the directory only if it does not exist yet.
!mkdir -p data
# Archive files (train / test / extra); `-O` sets the local output path.
!wget http://ufldl.stanford.edu/housenumbers/train.tar.gz -O data/train.tar.gz
!wget http://ufldl.stanford.edu/housenumbers/test.tar.gz -O data/test.tar.gz
!wget http://ufldl.stanford.edu/housenumbers/extra.tar.gz -O data/extra.tar.gz
# 32x32 .mat files (train / test / extra); these are what the Python code loads.
!wget http://ufldl.stanford.edu/housenumbers/train_32x32.mat -O data/train_32x32.mat
!wget http://ufldl.stanford.edu/housenumbers/test_32x32.mat -O data/test_32x32.mat
!wget http://ufldl.stanford.edu/housenumbers/extra_32x32.mat -O data/extra_32x32.mat
--2016-12-12 11:15:24--  http://ufldl.stanford.edu/housenumbers/train.tar.gz
Resolving ufldl.stanford.edu (ufldl.stanford.edu)... 171.64.68.10
Connecting to ufldl.stanford.edu (ufldl.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 404141560 (385M) [application/x-gzip]
Saving to: 'data/train.tar.gz'

100%[======================================>] 404,141,560 6.58MB/s   in 58s    

2016-12-12 11:16:22 (6.62 MB/s) - 'data/train.tar.gz' saved [404141560/404141560]

--2016-12-12 11:16:22--  http://ufldl.stanford.edu/housenumbers/test.tar.gz
Resolving ufldl.stanford.edu (ufldl.stanford.edu)... 171.64.68.10
Connecting to ufldl.stanford.edu (ufldl.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 276555967 (264M) [application/x-gzip]
Saving to: 'data/test.tar.gz'

100%[======================================>] 276,555,967 10.7MB/s   in 28s    

2016-12-12 11:16:50 (9.34 MB/s) - 'data/test.tar.gz' saved [276555967/276555967]

--2016-12-12 11:16:51--  http://ufldl.stanford.edu/housenumbers/extra.tar.gz
Resolving ufldl.stanford.edu (ufldl.stanford.edu)... 171.64.68.10
Connecting to ufldl.stanford.edu (ufldl.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1955489752 (1.8G) [application/x-gzip]
Saving to: 'data/extra.tar.gz'

100%[====================================>] 1,955,489,752 11.3MB/s   in 4m 29s 

2016-12-12 11:21:20 (6.94 MB/s) - 'data/extra.tar.gz' saved [1955489752/1955489752]

--2016-12-12 11:21:20--  http://ufldl.stanford.edu/housenumbers/train_32x32.mat
Resolving ufldl.stanford.edu (ufldl.stanford.edu)... 171.64.68.10
Connecting to ufldl.stanford.edu (ufldl.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182040794 (174M) [text/plain]
Saving to: 'data/train_32x32.mat'

100%[======================================>] 182,040,794 11.3MB/s   in 15s    

2016-12-12 11:21:35 (11.5 MB/s) - 'data/train_32x32.mat' saved [182040794/182040794]

--2016-12-12 11:21:35--  http://ufldl.stanford.edu/housenumbers/test_32x32.mat
Resolving ufldl.stanford.edu (ufldl.stanford.edu)... 171.64.68.10
Connecting to ufldl.stanford.edu (ufldl.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64275384 (61M) [text/plain]
Saving to: 'data/test_32x32.mat'

100%[======================================>] 64,275,384  13.3MB/s   in 4.9s   

2016-12-12 11:21:40 (12.6 MB/s) - 'data/test_32x32.mat' saved [64275384/64275384]

--2016-12-12 11:21:40--  http://ufldl.stanford.edu/housenumbers/extra_32x32.mat
Resolving ufldl.stanford.edu (ufldl.stanford.edu)... 171.64.68.10
Connecting to ufldl.stanford.edu (ufldl.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1329278602 (1.2G) [text/plain]
Saving to: 'data/extra_32x32.mat'

100%[====================================>] 1,329,278,602 6.50MB/s   in 2m 50s 

2016-12-12 11:24:30 (7.47 MB/s) - 'data/extra_32x32.mat' saved [1329278602/1329278602]
from __future__ import print_function, division
from scipy.io import loadmat as load
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# !mkdir -p data
# !wget http://ufldl.stanford.edu/housenumbers/train.tar.gz -O data/train.tar.gz
# !wget http://ufldl.stanford.edu/housenumbers/test.tar.gz -O data/test.tar.gz
# !wget http://ufldl.stanford.edu/housenumbers/extra.tar.gz -O data/extra.tar.gz
# !wget http://ufldl.stanford.edu/housenumbers/train_32x32.mat -O data/train_32x32.mat
# !wget http://ufldl.stanford.edu/housenumbers/test_32x32.mat -O data/test_32x32.mat
# !wget http://ufldl.stanford.edu/housenumbers/extra_32x32.mat -O data/extra_32x32.mat
from __future__ import print_function, division
from scipy.io import loadmat as load
import matplotlib.pyplot as plt
import numpy as np

def reformat(samples, labels):
	"""Reorder the image axes and one-hot encode the labels.

	Args:
		samples: 4-D array laid out as (height, width, channels, n_images).
		labels: per-image labels, normally an array of shape (n_images, 1)
			whose first column holds the class id. The value 10 stands for
			the digit 0. A flat 1-D sequence of class ids is accepted too.

	Returns:
		A tuple ``(new, labels)``:
		``new`` is ``samples`` converted to float32 with axes reordered to
		(n_images, height, width, channels);
		``labels`` is a float32 array of shape (n_images, 10) with a single
		1.0 per row, e.g. [2] -> [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] and
		[10] -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0].

	Raises:
		IndexError: if a label is outside the range a 10-slot row can index.
	"""
	# source axes:  0       1      2         3
	#              (height, width, channels, n_images)
	# target axes:  3         0       1      2
	#              (n_images, height, width, channels)
	new = np.transpose(samples, (3, 0, 1, 2)).astype(np.float32)

	raw = np.asarray(labels)
	if raw.ndim > 1:
		# keep only the first element of every label row
		raw = raw[:, 0]

	# The dataset stores digit 0 as class 10; fold it back onto index 0.
	digits = np.where(raw == 10, 0, raw).astype(np.intp)

	# Row k of the identity matrix is the one-hot vector for class k, so a
	# single fancy-index builds every row at once. This also yields a
	# well-formed (0, 10) result when there are no labels.
	labels = np.eye(10, dtype=np.float32)[digits]
	return new, labels

def normalize(samples):
	"""Convert images to a single grey channel and rescale the values.

	The values along axis 3 (the channel axis) are averaged as
	(R + G + B) / 3; the stated aim is to save memory and speed up training.
	Each averaged value v is then mapped to v / 128 - 1, so 0 becomes -1.0
	and 255 becomes just under +1.0.

	Args:
		samples: numpy array laid out as (n_images, height, width, channels).

	Returns:
		Array with the same first three dimensions and a channel axis of
		length 1.
	"""
	# keepdims keeps the channel axis so the result is still 4-D
	channel_total = np.sum(samples, axis=3, keepdims=True)
	grey = channel_total / 3.0
	return grey / 128.0 - 1.0


def distribution(labels, name):
	"""Show a bar chart of how often each digit label occurs.

	Args:
		labels: iterable of label rows; the first element of each row is the
			class id, with the value 10 counted as digit 0.
		name: text prepended to ' Label Distribution' in the chart title.

	The bars are drawn in ascending digit order so that charts for different
	datasets line up; relying on dict iteration order would instead follow
	the order in which digits first appear in ``labels``.
	"""
	# Fold class 10 onto digit 0, then count every distinct digit.
	digits = np.array([0 if label[0] == 10 else label[0] for label in labels])
	# np.unique returns the distinct values already sorted, plus their counts.
	keys, counts = np.unique(digits, return_counts=True)

	positions = np.arange(len(keys))
	plt.bar(positions, counts, align='center', alpha=0.5)
	plt.xticks(positions, keys.tolist())
	plt.ylabel('Count')
	plt.title(name + ' Label Distribution')
	plt.show()

def inspect(dataset, labels, i):
	"""Print the label of sample ``i`` and display its image.

	Args:
		dataset: 4-D array indexed as (image, height, width, channels).
		labels: sequence of labels, indexed the same way as ``dataset``.
		i: index of the sample to look at.
	"""
	n_channels = dataset.shape[3]
	if n_channels == 1:
		# Single-channel data: drop the trailing length-1 axis before
		# handing the image to imshow.
		dataset = dataset.reshape(dataset.shape[:3])
	print(labels[i])
	image = dataset[i]
	plt.imshow(image)
	plt.show()


# Read the 32x32 .mat files from data/ (the 'extra' split is left disabled).
train = load('data/train_32x32.mat')
test = load('data/test_32x32.mat')
# extra = load('data/extra_32x32.mat')

# Raw arrays as stored in the files: 'X' holds the images, 'y' the labels.
train_samples, train_labels = train['X'], train['y']
test_samples, test_labels = test['X'], test['y']
# extra_samples, extra_labels = extra['X'], extra['y']

print('Train Samples Shape:', train_samples.shape)
print('Train  Labels Shape:', train_labels.shape)
print('Test Samples Shape:', test_samples.shape)
print('Test  Labels Shape:', test_labels.shape)
# print('Extra Samples Shape:', extra_samples.shape)
# print('Extra  Labels Shape:', extra_labels.shape)

# Image-first float32 samples plus one-hot labels.
n_train_samples, n_train_labels = reformat(train_samples, train_labels)
n_test_samples, n_test_labels = reformat(test_samples, test_labels)

# Single-channel, rescaled versions of the samples.
_train_dataset = normalize(n_train_samples)
_test_dataset = normalize(n_test_samples)

if __name__ == '__main__':
	# Show the same sample after and before normalisation.
	for images in (_train_dataset, n_train_samples):
		inspect(images, n_train_labels, 1234)
	# Plot the label histogram of each split from the raw labels.
	for raw_labels, title in ((train_labels, 'Train Labels'), (test_labels, 'Test Labels')):
		distribution(raw_labels, title)

Train Samples Shape: (32, 32, 3, 73257)
Train  Labels Shape: (73257, 1)
Test Samples Shape: (32, 32, 3, 26032)
Test  Labels Shape: (26032, 1)
[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]

png

[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]

png

png

png

Pengfei Ni ©2021