import os

import numpy as np
import pandas as pd
from scipy.misc import imread  # NOTE(review): removed in modern SciPy; original code relies on it
from sklearn.metrics import accuracy_score

import tensorflow as tf
# Set a seed value so that we can control the randomness of the model:
# Pin NumPy's random state to a fixed seed so every run of this
# script reproduces the same image choices and weight initialisations.
seed = 128
rng = np.random.RandomState(seed)
# The first step is to set the directory paths, for safekeeping!
# Directory layout: the project root is two levels up, with the raw
# data under `data/` and submission files under `sub/`.
root_dir = os.path.abspath('../..')
data_dir = os.path.join(root_dir, 'data')
sub_dir = os.path.join(root_dir, 'sub')

# The original "check for existence" discarded the os.path.exists()
# return values, so a missing directory went completely unnoticed.
# Surface the problem instead of silently ignoring it.
for _dir in (root_dir, data_dir, sub_dir):
    if not os.path.exists(_dir):
        print('Warning: expected directory does not exist: {}'.format(_dir))
# Let's look at our dataset. It is in CSV format and contains the
# filenames along with their corresponding labels:
# Load the CSV index files; each row pairs an image filename with its label.
_train_csv = os.path.join(data_dir, 'Train', 'train.csv')
_test_csv = os.path.join(data_dir, 'Test.csv')
_sample_csv = os.path.join(data_dir, 'Sample_Submission.csv')

train = pd.read_csv(_train_csv)
test = pd.read_csv(_test_csv)
sample_submission = pd.read_csv(_sample_csv)

# Peek at the first few training rows.
train.head()
# Let's see what our data looks like! We read one of our images and display it.
# Sample one training image at random (reproducible via `rng`),
# load it as a flat grayscale array, and display it without axes.
img_name = rng.choice(train.filename)
filepath = os.path.join(data_dir, 'Train', 'Images', 'train', img_name)

img = imread(filepath, flatten=True)

pylab.imshow(img, cmap='gray')
pylab.axis('off')
pylab.show()
# The image above is represented as a numpy array, as shown below.
# To make data manipulation easier, we store all our images as numpy arrays:
# Read every training image once, as float32 grayscale, and stack the
# results into a single (n_images, height, width) array.
train_x = np.stack([
    imread(os.path.join(data_dir, 'Train', 'Images', 'train', img_name),
           flatten=True).astype('float32')
    for img_name in train.filename
])
temp = []
for img_name in test.filename:
image_path = os.path.join(data_dir,
# As this is a typical ML problem, we create a validation set to test the
# proper functioning of our model. We take a 70:30 split for the training
# set versus the validation set:
# Hold out the last 30% of the examples for validation; the first 70%
# remain the training set.  Images and labels are sliced identically so
# they stay aligned.
split_size = int(train_x.shape[0] * 0.7)

train_x, val_x = train_x[:split_size], train_x[split_size:]
train_y, val_y = train.label.values[:split_size], train.label.values[split_size:]
# Now we define some helper functions, which we will use later:
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert a 1-D array of class labels to a one-hot encoded matrix.

    Args:
        labels_dense: integer label array of shape (num_labels,).
        num_classes: total number of classes (default 10, one per digit).

    Returns:
        Float array of shape (num_labels, num_classes) containing a 1 in
        the column of each row's label and 0 everywhere else.
    """
    # Original extraction garbled the def line ("defdense_to_one_hot")
    # and the docstring; logic below is unchanged.
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    # Flat-index trick: row i, column labels_dense[i] lives at flat
    # position i * num_classes + labels_dense[i].
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot
def preproc(unclean_batch_x):
    """Scale pixel values into [0, 1] by dividing by the batch maximum.

    Args:
        unclean_batch_x: numeric array of raw pixel values.

    Returns:
        Array of the same shape with every value divided by the global
        maximum of the batch.

    NOTE(review): if the batch maximum is 0 this divides by zero;
    upstream image data makes that unlikely, but worth confirming.
    """
    # Original extraction garbled the def line ("defpreproc") and the
    # docstring; the normalisation itself is unchanged.
    temp_batch = unclean_batch_x / unclean_batch_x.max()
    return temp_batch
defbatch_creator(batch_size, dataset_length, dataset_name):
\
20