title | description | ms.service | ms.custom | ms.topic | ms.date |
---|---|---|---|---|---|
MNIST database of handwritten digits |
Learn how to use the MNIST database of handwritten digits dataset in Azure Open Datasets. |
open-datasets |
event-tier1-build-2022 |
sample |
04/16/2021 |
The MNIST database of handwritten digits has a training set of 60,000 examples and a test set of 10,000 examples. The digits have been size-normalized and centered in a fixed-size image.
[!INCLUDE Open Dataset usage notice]
This dataset is sourced from THE MNIST DATABASE of handwritten digits. It's a subset of the larger NIST Hand-printed Forms and Characters Database published by National Institute of Standards and Technology.
- Blob account: azureopendatastorage
- Container name: mnist
Four files are available in the container directly:
- train-images-idx3-ubyte.gz: training set images (9,912,422 bytes)
- train-labels-idx1-ubyte.gz: training set labels (28,881 bytes)
- t10k-images-idx3-ubyte.gz: test set images (1,648,877 bytes)
- t10k-labels-idx1-ubyte.gz: test set labels (4,542 bytes)
For more information on Azure Machine Learning datasets, see Create Azure Machine Learning datasets.
from azureml.opendatasets import MNIST
mnist = MNIST.get_tabular_dataset()
mnist_df = mnist.to_pandas_dataframe()
mnist_df.info()
mnist_train = MNIST.get_tabular_dataset(dataset_filter='train')
mnist_train_df = mnist_train.to_pandas_dataframe()
X_train = mnist_train_df.drop("label", axis=1).astype(int).values/255.0
y_train = mnist_train_df.filter(items=["label"]).astype(int).values
mnist_test = MNIST.get_tabular_dataset(dataset_filter='test')
mnist_test_df = mnist_test.to_pandas_dataframe()
X_test = mnist_test_df.drop("label", axis=1).astype(int).values/255.0
y_test = mnist_test_df.filter(items=["label"]).astype(int).values
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# now let's show some randomly chosen images from the traininng set.
count = 0
sample_size = 30
plt.figure(figsize=(16, 6))
for i in np.random.permutation(X_train.shape[0])[:sample_size]:
count = count + 1
plt.subplot(1, sample_size, count)
plt.axhline('')
plt.axvline('')
plt.text(x=10, y=-10, s=y_train[i], fontsize=18)
plt.imshow(X_train[i].reshape(28, 28), cmap=plt.cm.Greys)
plt.show()
This works only for Linux based compute. For more information on Azure Machine Learning datasets, see Create Azure Machine Learning datasets.
mnist_file = MNIST.get_file_dataset()
mnist_file
mnist_file.to_path()
import os
import tempfile
data_folder = tempfile.mkdtemp()
data_paths = mnist_file.download(data_folder, overwrite=True)
data_paths
import gzip
import struct
import pandas as pd
import numpy as np
# load compressed MNIST gz files and return pandas dataframe of numpy arrays
def load_data(filename, label=False):
with gzip.open(filename) as gz:
gz.read(4)
n_items = struct.unpack('>I', gz.read(4))
if not label:
n_rows = struct.unpack('>I', gz.read(4))[0]
n_cols = struct.unpack('>I', gz.read(4))[0]
res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
res = res.reshape(n_items[0], n_rows * n_cols)
else:
res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
res = res.reshape(n_items[0], 1)
return pd.DataFrame(res)
import sys
mount_point = tempfile.mkdtemp()
print(mount_point)
print(os.path.exists(mount_point))
if sys.platform == 'linux':
print("start mounting....")
with mnist_file.mount(mount_point):
print("list dir...")
print(os.listdir(mount_point))
print("get the dataframe info of mounted data...")
train_images_df = load_data(next(path for path in data_paths if path.endswith("train-images-idx3-ubyte.gz")))
print(train_images_df.info())
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import os
data_folder = os.path.join(os.getcwd(), 'data')
os.makedirs(data_folder, exist_ok=True)
urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',
filename=os.path.join(data_folder, 'train-images.gz'))
urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz',
filename=os.path.join(data_folder, 'train-labels.gz'))
urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',
filename=os.path.join(data_folder, 'test-images.gz'))
urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',
filename=os.path.join(data_folder, 'test-labels.gz'))
import gzip
import struct
# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
with gzip.open(filename) as gz:
struct.unpack('I', gz.read(4))
n_items = struct.unpack('>I', gz.read(4))
if not label:
n_rows = struct.unpack('>I', gz.read(4))[0]
n_cols = struct.unpack('>I', gz.read(4))[0]
res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
res = res.reshape(n_items[0], n_rows * n_cols)
else:
res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
res = res.reshape(n_items[0], 1)
return res
# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the model converge faster.
X_train = load_data(os.path.join(
data_folder, 'train-images.gz'), False) / 255.0
X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0
y_train = load_data(os.path.join(
data_folder, 'train-labels.gz'), True).reshape(-1)
y_test = load_data(os.path.join(
data_folder, 'test-labels.gz'), True).reshape(-1)
# now let's show some randomly chosen images from the traininng set.
count = 0
sample_size = 30
plt.figure(figsize=(16, 6))
for i in np.random.permutation(X_train.shape[0])[:sample_size]:
count = count + 1
plt.subplot(1, sample_size, count)
plt.axhline('')
plt.axvline('')
plt.text(x=10, y=-10, s=y_train[i], fontsize=18)
plt.imshow(X_train[i].reshape(28, 28), cmap=plt.cm.Greys)
plt.show()
For more information on Azure Machine Learning datasets, see Create Azure Machine Learning datasets.
# This is a package in preview.
from azureml.opendatasets import MNIST
mnist = MNIST.get_tabular_dataset()
mnist_df = mnist.to_spark_dataframe()
display(mnist_df.limit(5))
This works only for Linux based compute. For more information on Azure Machine Learning datasets, see Create Azure Machine Learning datasets.
mnist_file = MNIST.get_file_dataset()
mnist_file
mnist_file.to_path()
import os
import tempfile
mount_point = tempfile.mkdtemp()
mnist_file.download(mount_point, overwrite=True)
import gzip
import struct
import pandas as pd
import numpy as np
# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
with gzip.open(filename) as gz:
gz.read(4)
n_items = struct.unpack('>I', gz.read(4))
if not label:
n_rows = struct.unpack('>I', gz.read(4))[0]
n_cols = struct.unpack('>I', gz.read(4))[0]
res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
res = res.reshape(n_items[0], n_rows * n_cols)
else:
res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
res = res.reshape(n_items[0], 1)
return pd.DataFrame(res)
import sys
mount_point = tempfile.mkdtemp()
print(mount_point)
print(os.path.exists(mount_point))
print(os.listdir(mount_point))
if sys.platform == 'linux':
print("start mounting....")
with mnist_file.mount(mount_point):
print(context.mount_point )
print(os.listdir(mount_point))
train_images_df = load_data(os.path.join(mount_point, 'train-images-idx3-ubyte.gz'))
print(train_images_df.info())
Sample not available for this platform/package combination.
View the rest of the datasets in the Open Datasets catalog.