Numpy

NumPy is a commonly used Python data analysis package. By using NumPy, you can speed up your workflow, and interface with other packages in the Python ecosystem, like scikit-learn, that use NumPy under the hood. NumPy was originally developed in the mid 2000s, and arose from an even older package called Numeric. This longevity means that almost every data analysis or machine learning package for Python leverages NumPy in some way.

Download Wine Quality Data Set

import requests

# download Wine Quality Data Set.
r = requests.get("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", stream=True)
if r.status_code == 200:
    with open("winequality-red.csv", 'wb') as f:
        for chunk in r.iter_content(1024):
            f.write(chunk)

Read dataset

import pandas as pd

wines = pd.read_csv("winequality-red.csv", ';')
print wines.shape
wines.head()
(1599, 12)
import csv
with open("winequality-red.csv", 'r') as f:
    wines = list(csv.reader(f, delimiter=";"))

# skip header
wines = np.array(wines[1:], dtype=np.float)
wines.shape
(1599, 12)
import numpy as np
wines = np.genfromtxt("winequality-red.csv", delimiter=";", skip_header=1)
wines.shape
(1599, 12)

Creating A NumPy Array

a = np.zeros((3,4))
print a
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
a.reshape(2,6)
array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])
a.dtype
dtype('float64')
# create a array of random numbers
b = np.random.rand(3,4)
print b.dtype
print b
float64
[[ 0.47340722  0.26032462  0.93331738  0.34234496]
 [ 0.54765394  0.165398    0.51294266  0.4961539 ]
 [ 0.06106089  0.34025539  0.31058294  0.21797288]]
c = np.linspace(0, 50, 10)
print c
[  0.           5.55555556  11.11111111  16.66666667  22.22222222
  27.77777778  33.33333333  38.88888889  44.44444444  50.        ]
# convert data type
c.astype(int)
array([ 0,  5, 11, 16, 22, 27, 33, 38, 44, 50])

Array operations

wines[:, 11] + 10
array([ 15.,  15.,  15., ...,  16.,  15.,  16.])
wines[:, 11] += 10
wines[:,11] + wines[:,11]
array([ 30.,  30.,  30., ...,  32.,  30.,  32.])
# Let’s say we want to pick a wine that maximizes alcohol
# content and quality (we want to get drunk, but we’re classy).
# We’d multiply alcohol by quality, and select the wine with the highest score:
# Note: /, *, -, +, ^ performs element math for same size vectors.
wines[:,10] * wines[:,11]
array([ 141.,  147.,  147., ...,  176.,  153.,  176.])
# Broadcasting

array_one = np.array(
    [
        [1,2],
        [3,4]
    ]
)
array_two = np.array([4,5])

array_one + array_two
array([[5, 7],
       [7, 9]])
# Array methods

wines.sum()
168074.78193999999
# sums over the first axis of the array.
# This will give us the sum of all the values in every column.
wines.sum(axis=0)
array([ 13303.1    ,    843.985  ,    433.29   ,   4059.55   ,
          139.859  ,  25384.     ,  74302.     ,   1593.79794,
         5294.47   ,   1052.38   ,  16666.35   ,  25002.     ])
# If we pass in axis=1, we’ll find the sums over
# the second axis of the array. This will give us the sum of each row:
wines.sum(axis=1)
array([  84.5438 ,  133.0548 ,  109.699  , ...,  110.48174,  115.21547,
        102.49249])

Matrix

A = np.matrix('1.0 2.0; 3.0 4.0')
print A
[[ 1.  2.]
 [ 3.  4.]]
# transpose
A.T
matrix([[ 1.,  3.],
        [ 2.,  4.]])
# matrix multiplication
X = np.matrix('5.0 7.0')
A*X.T
matrix([[ 19.],
        [ 43.]])
# matrix inverse
A.I
matrix([[-2. ,  1. ],
        [ 1.5, -0.5]])

Vector Stacking

x = np.arange(0,10,2)                     # x=([0,2,4,6,8])
y = np.arange(5)                          # y=([0,1,2,3,4])
m = np.vstack([x,y])                      # m=([[0,2,4,6,8],
                                          #     [0,1,2,3,4]])
xy = np.hstack([x,y])                     # xy =([0,2,4,6,8,0,1,2,3,4])

Histograms

import numpy
import pylab
# Build a vector of 10000 normal deviates with variance 0.5^2 and mean 2
mu, sigma = 2, 0.5
v = numpy.random.normal(mu,sigma,10000)
# Plot a normalized histogram with 50 bins
pylab.hist(v, bins=50, normed=1)       # matplotlib version (plot)
pylab.show()
# Compute the histogram with numpy and then plot it
(n, bins) = numpy.histogram(v, bins=50, normed=True)  # NumPy version (no plot)
pylab.plot(.5*(bins[1:]+bins[:-1]), n)
pylab.show()

png

png