# Programming and Data Structures with Python
# Lecture 17, 22 November 2021

# Using numpy

- Arrays and lists
- Arrays are "homogeneous" (all elements have the same type) with a regular structure
- Lists are flexible

## Load numpy

In [1]:
import numpy as np

In [2]:
a = np.array([1,2,3])
a

array([1, 2, 3])

In [3]:
b = np.array(range(10))
b

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
c = np.array([[0,1],[2,3]])
c

array([[0, 1],
       [2, 3]])

## Indexing and slicing

In [5]:
a = np.arange(10)**3
a

array([  0,   1,   8,  27,  64, 125, 216, 343, 512, 729])

In [6]:
a[2], a[2:5]

(8, array([ 8, 27, 64]))

In [7]:
a[:6:2] = -1000  # equivalent to a[0:6:2] = -1000
a

array([-1000,     1, -1000,    27, -1000,   125,   216,   343,   512,
         729])

In [8]:
def f(x, y):
    """Return 10*x + y, e.g. f(5, 7) gives 57.

    Works elementwise when x and y are numpy index arrays, which is how
    np.fromfunction calls it.
    """
    tens = 10 * x
    return tens + y

In [9]:
f(5,7)

57

In [10]:
b = np.fromfunction(f,(5,4),dtype=int)
b

array([[ 0,  1,  2,  3],
       [10, 11, 12, 13],
       [20, 21, 22, 23],
       [30, 31, 32, 33],
       [40, 41, 42, 43]])

In [11]:
b[2,3]  # Preferred over b[2][3], which gives the same element but indexes twice (row b[2] first, then entry 3)

23

In [12]:
b[0:5, 1] # each row in the second column of b

array([ 1, 11, 21, 31, 41])

In [13]:
b[ : ,1]  # equivalent to the previous example

array([ 1, 11, 21, 31, 41])

In [14]:
b[1:3, :]  # each column in the second and third row of b

array([[10, 11, 12, 13],
       [20, 21, 22, 23]])

In [15]:
b[1:4,1:3]

array([[11, 12],
       [21, 22],
       [31, 32]])

## Iterating over elements

In [16]:
print(b)

[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]
 [40 41 42 43]]


In [17]:
for row in b:
    print(row)

[0 1 2 3]
[10 11 12 13]
[20 21 22 23]
[30 31 32 33]
[40 41 42 43]


In [18]:
for element in b.flat:
    print(element,end=' ')

0 1 2 3 10 11 12 13 20 21 22 23 30 31 32 33 40 41 42 43 

## Stacking arrays

In [19]:
a = np.zeros((5,7),dtype=int)
a

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [20]:
a = np.floor(10*np.random.random((2,2)))
b = np.floor(10*np.random.random((2,2)))
print(a,b)

[[2. 8.]
 [2. 7.]] [[6. 7.]
 [1. 7.]]


In [21]:
np.vstack((a,b))

array([[2., 8.],
       [2., 7.],
       [6., 7.],
       [1., 7.]])

In [22]:
c = np.floor(10*np.random.random((3,3)))

In [23]:
c

array([[7., 5., 2.],
       [1., 9., 9.],
       [3., 9., 8.]])

In [24]:
np.vstack((a,c))

ValueError: ignored

In [26]:
np.hstack((a,b))

array([[2., 8., 6., 7.],
       [2., 7., 1., 7.]])

In [27]:
np.hstack((b,c))

ValueError: ignored

## Splitting arrays

In [29]:
a = np.floor(10*np.random.random((2,12)))
a

array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.],
       [0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]])

In [30]:
np.hsplit(a,6)

[array([[9., 5.],
        [0., 6.]]), array([[7., 2.],
        [9., 5.]]), array([[3., 3.],
        [6., 4.]]), array([[6., 4.],
        [8., 7.]]), array([[0., 4.],
        [4., 7.]]), array([[8., 1.],
        [7., 1.]])]

In [31]:
np.hsplit(a,(2,5,7)) # Split a at column indices 2, 5 and 7, i.e. after the 2nd, 5th and 7th columns

[array([[9., 5.],
        [0., 6.]]), array([[7., 2., 3.],
        [9., 5., 6.]]), array([[3., 6.],
        [4., 8.]]), array([[4., 0., 4., 8., 1.],
        [7., 4., 7., 7., 1.]])]

In [32]:
np.vsplit(a,2) # Split a vertically

[array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.]]),
 array([[0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]])]

## Copy and view

In [33]:
c = a.copy()  # Creates an independent copy with its own data
d = a.view()  # Creates a new array object that shares its data with a

In [34]:
a, c, d

(array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.],
        [0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]]),
 array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.],
        [0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]]),
 array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.],
        [0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]]))

In [35]:
c[0,4] = 88

In [36]:
a, c, d

(array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.],
        [0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]]),
 array([[ 9.,  5.,  7.,  2., 88.,  3.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[9., 5., 7., 2., 3., 3., 6., 4., 0., 4., 8., 1.],
        [0., 6., 9., 5., 6., 4., 8., 7., 4., 7., 7., 1.]]))

In [37]:
d[0,5] = 66

In [38]:
a, c, d

(array([[ 9.,  5.,  7.,  2.,  3., 66.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[ 9.,  5.,  7.,  2., 88.,  3.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[ 9.,  5.,  7.,  2.,  3., 66.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]))

Use `base` to check whether two arrays share the same underlying data

In [39]:
c.base is a, d.base is a

(False, True)

A view can be reshaped without affecting the shape of the array it shares its data with

In [40]:
d.shape = 4,6

In [41]:
a,c,d

(array([[ 9.,  5.,  7.,  2.,  3., 66.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[ 9.,  5.,  7.,  2., 88.,  3.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[ 9.,  5.,  7.,  2.,  3., 66.],
        [ 6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.],
        [ 8.,  7.,  4.,  7.,  7.,  1.]]))

In [42]:
d[2,4] = 99

In [43]:
a, c, d

(array([[ 9.,  5.,  7.,  2.,  3., 66.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5., 99.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[ 9.,  5.,  7.,  2., 88.,  3.,  6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5.,  6.,  4.,  8.,  7.,  4.,  7.,  7.,  1.]]),
 array([[ 9.,  5.,  7.,  2.,  3., 66.],
        [ 6.,  4.,  0.,  4.,  8.,  1.],
        [ 0.,  6.,  9.,  5., 99.,  4.],
        [ 8.,  7.,  4.,  7.,  7.,  1.]]))

In [44]:
a = np.array([[1,2],[3,4]])
b = np.array([[5,6],[7,8]])

In [45]:
a,b

(array([[1, 2],
        [3, 4]]), array([[5, 6],
        [7, 8]]))

In [46]:
a+b  # Pointwise addition

array([[ 6,  8],
       [10, 12]])

In [47]:
a*b  # Pointwise multiplication

array([[ 5, 12],
       [21, 32]])

In [49]:
np.matmul(a,b)  # Normal matrix multiplication

array([[19, 22],
       [43, 50]])

In [51]:
a.T   # Transpose

array([[1, 3],
       [2, 4]])

In [52]:
np.linalg.inv(a)  # Matrix inverse

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

In [54]:
np.matmul(a,np.linalg.inv(a))  # a a^-1

array([[1.0000000e+00, 0.0000000e+00],
       [8.8817842e-16, 1.0000000e+00]])

# Pandas (Python and data analysis)
- Built on top of numpy

## Series and data frames

- Numpy defines homogeneous n-dimensional arrays

- Data science works with tables: 2-dimensional arrays

- Pandas has two fundamental data structures

    - Series : A column of data
    - Data Frame : A table of data

## Key difference
- Numpy indices are always [0..n-1] in each dimension
- Pandas allows more flexible “named” indices for rows and columns
    - Dictionary vs list

## Load pandas

- Don't need to import numpy unless one is separately using numpy arrays

In [55]:
import pandas as pd

## Create a series

- Convert a sequence into a series (column)

In [56]:
h = ('AA', '2012-02-01', 100, 10.2)
s = pd.Series(h)
type(s)

pandas.core.series.Series

In [57]:
s

0            AA
1    2012-02-01
2           100
3          10.2
dtype: object

- Convert a dictionary to a series
- Keys become "row indices"

In [58]:
d = {'name' : 'IBM', 'date' :'2010-09-08', 'shares' : 100, 'price' : 10.2}
ds = pd.Series(d)
type(ds)

pandas.core.series.Series

In [59]:
ds

name             IBM
date      2010-09-08
shares           100
price           10.2
dtype: object

## Creating an index

In [60]:
f = ['FB', '2001-08-02', 90, 3.2]
fs = pd.Series(f, index = ['name','date', 'shares', 'price'])

In [61]:
fs

name              FB
date      2001-08-02
shares            90
price            3.2
dtype: object

## Accessing elements

- Use named index, or position
- Use slices, sublists

In [62]:
fs['shares']

90

In [63]:
fs.iloc[0]  # Positional access; plain fs[0] on a label-indexed Series is deprecated in recent pandas

'FB'

In [64]:
fs[0:2]

name            FB
date    2001-08-02
dtype: object

In [65]:
fs.iloc[[0,2]]  # Positional sublist; plain fs[[0,2]] on a label-indexed Series is deprecated in recent pandas

name      FB
shares    90
dtype: object

In [66]:
fs['name':'price']

name              FB
date      2001-08-02
shares            90
price            3.2
dtype: object

## Data frames

- A table is a sequence of columns
- A data frame is a sequence of series

In [67]:
data1 = [ ['AA', 'IBM', 'GOOG'], 
         ['2001-12-01', '2012-02-10', '2010-04-09'],
         [100, 30, 90],
         [12.3, 10.3, 32.2]
       ]
df1 = pd.DataFrame(data1)

In [68]:
df1

            0           1           2
0          AA         IBM        GOOG
1  2001-12-01  2012-02-10  2010-04-09
2         100          30          90
3        12.3        10.3        32.2


In [69]:
data2 = {'name' : ['AA', 'IBM', 'GOOG'], 
        'date' : ['2001-12-01', '2012-02-10', '2010-04-09'],
        'shares' : [100, 30, 90],
        'price' : [12.3, 10.3, 32.2]
}
df2 = pd.DataFrame(data2)

In [70]:
df2

   name        date  shares  price
0    AA  2001-12-01     100   12.3
1   IBM  2012-02-10      30   10.3
2  GOOG  2010-04-09      90   32.2
