Pandas Tutorials¶

Part I : Basics¶

Basics1: Quickstart¶

import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24, 70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframe
df = pd.DataFrame(data, columns = labels)

# Others
print(df)

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

Basics2 : table¶

import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)
B  = pd.DataFrame()	# an empty dataframe

# dealing with tables
print(df)
print()
print(df.empty)
print()
print(B.empty)
print()
print(df.ndim)
print()
print(df.size)
print()
print(df.shape)
print()
print(df.dtypes)

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

False

True

2

25

(5, 5)

first_name     object
last_name      object
age             int64
math            int64
english       float64
dtype: object

Basics3 : Rows¶

import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)

# Dealing with rows
print(df[:2])
print()
print(df[:-2])
print()
print(df[2:4])
print()
print(df[-4:-2])

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0

  first_name last_name  age  math  english
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0

  first_name last_name  age  math  english
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0

Basics4 : Columns¶

import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)


# Dealing with columns
print(df['last_name'])
print()
print(df['last_name'][2:])
print()
print(df['last_name'][-2:])
print()
print(df['last_name'][2:4])
print()
print(df['last_name'][-4:-2])

0     Sahli
1     Cooze
2      Baba
3       Bao
4    Joseph
Name: last_name, dtype: object

2      Baba
3       Bao
4    Joseph
Name: last_name, dtype: object

3       Bao
4    Joseph
Name: last_name, dtype: object

2    Baba
3     Bao
Name: last_name, dtype: object

1    Cooze
2     Baba
Name: last_name, dtype: object

Basics5 : Items¶

import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)


# Dealing with items
print(df['age'][0])
print()
print(df['age'][3])
print()
print(df['age'].min())
print()
print(df['age'].max())
print()
print(df['age'].mean())
print()
print(df['age'].sum())

42

24

24

73

45.4

227

Part II : Tables¶

import numpy as np
import pandas as pd

# data and labels and pandas dataframes
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']

df = pd.DataFrame(data, columns = labels)


# Tables
print(df.values)	# convert Table to a Matrix
print()
print(df.head(2))
print()
print(df.tail(2))
print()
B = df.T			# transpose of the table
print(B)

[['Simo' 'Sahli' 42 4 25.0]
 ['Amy' 'Cooze' 52 24 94.0]
 ['Ali' 'Baba' 36 31 57.0]
 ['Jiarong' 'Bao' 24 70 62.0]
 ['Jacob' 'Joseph' 73 3 nan]]

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0

  first_name last_name  age  math  english
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

                0      1     2        3       4
first_name   Simo    Amy   Ali  Jiarong   Jacob
last_name   Sahli  Cooze  Baba      Bao  Joseph
age            42     52    36       24      73
math            4     24    31       70       3
english        25     94    57       62     NaN

import numpy as np
import pandas as pd

# data and labels and pandas dataframes
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']

data2 = [('koko', 'kaka' , 27, 90, 15),
         ('bobo', 'baba' , 25, 14, 97)]
df = pd.DataFrame(data, columns = labels)
df2= pd.DataFrame(data2, columns = labels)

# Tables Concatenation
print(df)
print()
print(df2)
df = df.append(df2)
print()
print(df)

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

  first_name last_name  age  math  english
0       koko      kaka   27    90       15
1       bobo      baba   25    14       97

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN
0       koko      kaka   27    90     15.0
1       bobo      baba   25    14     97.0

Part III : Columns¶

import numpy as np
import pandas as pd

# data and labels and pandas dataframes
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']
df = pd.DataFrame(data, columns = labels)

# Columns: Column to an array
print(df['last_name'].values)
print()
# Columns: Addition and Deletion
df['score'] = (df['math'] + df['english']) / 2.0
print(df)
print()
# del df['score']
df.pop('score') # the same as del df['score']
print(df)

['Sahli' 'Cooze' 'Baba' 'Bao' 'Joseph']

  first_name last_name  age  math  english  score
0       Simo     Sahli   42     4     25.0   14.5
1        Amy     Cooze   52    24     94.0   59.0
2        Ali      Baba   36    31     57.0   44.0
3    Jiarong       Bao   24    70     62.0   66.0
4      Jacob    Joseph   73     3      NaN    NaN

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

import numpy as np
import pandas as pd

# data and labels and pandas dataframe
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Simo'   , 'Koko' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']
df = pd.DataFrame(data, columns = labels)

# Columns: Sorting by values
df = df.sort_values(by='math')
print(df)
print("--------------")
df = df.sort_values(by=['first_name','last_name'])
print(df)
print("--------------")

  first_name last_name  age  math  english
4      Jacob    Joseph   73     3      NaN
0       Simo     Sahli   42     4     25.0
1       Simo      Koko   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
--------------
  first_name last_name  age  math  english
2        Ali      Baba   36    31     57.0
4      Jacob    Joseph   73     3      NaN
3    Jiarong       Bao   24    70     62.0
1       Simo      Koko   52    24     94.0
0       Simo     Sahli   42     4     25.0
--------------

import numpy as np
import pandas as pd

# data and labels and pandas dataframe
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']
df = pd.DataFrame(data, columns = labels)

# Columns: Sorting by index
df = df.sort_index(ascending=False)
print(df)
print("--------------")
df = df.sort_index(axis=0)
print(df)
print("--------------")
df = df.sort_index(axis=1)
print(df)
print("--------------")

  first_name last_name  age  math  english
4      Jacob    Joseph   73     3      NaN
3    Jiarong       Bao   24    70     62.0
2        Ali      Baba   36    31     57.0
1        Amy     Cooze   52    24     94.0
0       Simo     Sahli   42     4     25.0
--------------
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN
--------------
   age  english first_name last_name  math
0   42     25.0       Simo     Sahli     4
1   52     94.0        Amy     Cooze    24
2   36     57.0        Ali      Baba    31
3   24     62.0    Jiarong       Bao    70
4   73      NaN      Jacob    Joseph     3
--------------

Part IV : Rows¶

import numpy as np
import pandas as pd

# data and labels and pandas dataframe
#....


# Rows:
print(df[-4:-2].values)
print()
print(df.iloc[2])
print()
print(df.loc[2])
print()
print(df.loc[2].values)
print()
df = df.drop(2) # deleting row at index 2
print(df)

[['Amy' 'Cooze' 52 24 94.0]
 ['Ali' 'Baba' 36 31 57.0]]

first_name     Ali
last_name     Baba
age             36
math            31
english         57
Name: 2, dtype: object

first_name     Ali
last_name     Baba
age             36
math            31
english         57
Name: 2, dtype: object

['Ali' 'Baba' 36 31 57.0]

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

Part V : Loops¶

import numpy as np
import pandas as pd

# data and labels and pandas dataframe
#....

# Loop for columns:
for a,b in df.iteritems():
   print(a)
   print()
   print(b)


# Loop for rows:
for a,b in df.iterrows():
   print(a)
   print()
   print(b)

first_name

0       Simo
1        Amy
3    Jiarong
4      Jacob
Name: first_name, dtype: object
last_name

0     Sahli
1     Cooze
3       Bao
4    Joseph
Name: last_name, dtype: object
age

0    42
1    52
3    24
4    73
Name: age, dtype: int64
math

0     4
1    24
3    70
4     3
Name: math, dtype: int64
english

0    25.0
1    94.0
3    62.0
4     NaN
Name: english, dtype: float64
0

first_name     Simo
last_name     Sahli
age              42
math              4
english          25
Name: 0, dtype: object
1

first_name      Amy
last_name     Cooze
age              52
math             24
english          94
Name: 1, dtype: object
3

first_name    Jiarong
last_name         Bao
age                24
math               70
english            62
Name: 3, dtype: object
4

first_name     Jacob
last_name     Joseph
age               73
math               3
english          NaN
Name: 4, dtype: object

Part VI: Files¶

Read CSV 1/2¶

import numpy as np
import pandas as pd

labels= ['first_name','last_name','age','math','english']

df = pd.read_csv("datas/test.csv",sep=',',header=None)
df.columns = labels
print(df)

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

Read CSV 2/2¶

import numpy as np
import pandas as pd

df = pd.read_csv("datas/test.csv",sep=',',header=0)
print(df)

      Simo   Sahli  42   4    25
0      Amy   Cooze  52  24  94.0
1      Ali    Baba  36  31  57.0
2  Jiarong     Bao  24  70  62.0
3    Jacob  Joseph  73   3   NaN

Write to CSV¶

import numpy as np
import pandas as pd

df = pd.read_csv("datas/test.csv",sep=',',header=0)
df = df.drop(2)

df.to_csv("datas/test3.csv", encoding='utf-8', index=False)

Part VII : Lambda¶

lambda 1/2¶

def my_sum(a,b):
	return a + b

s = lambda x, y : x + y

print(my_sum(5,2))
print()
print(s(5,2))

7

7

lambda 2/2¶

def myfunc(n):
  return lambda a : a * n

mydoubler = myfunc(2)
mytripler = myfunc(3)

print(mydoubler(5))
print()
print(mydoubler(11))
print()
print(mytripler(5))
print()
print(mytripler(11))

10

22

15

33

Part VIII : Groupby¶

import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Team')
b = df.groupby(['Team','Year'])
print(type(a.groups))
print(type(b.groups))

print("--------------------")
for x,y in a.groups.items():
    print(x,"\t",list(y))

    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
<class 'dict'>
<class 'dict'>
--------------------
Devils 	 [2, 3]
Kings 	 [4, 5, 6, 7]
Riders 	 [0, 1, 8, 11]
Royals 	 [9, 10]

Groupby: Select¶

import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Year')
print(a.get_group(2014))
print("--------------------")
a = df.groupby(['Team','Year'])
print(a.get_group(('Riders',2014)))

    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
   Points  Rank    Team  Year
0     876     1  Riders  2014
2     863     2  Devils  2014
4     741     3   Kings  2014
9     701     4  Royals  2014
--------------------
   Points  Rank    Team  Year
0     876     1  Riders  2014

Groupby: Aggregation¶

import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Year')
print(a['Points'].agg(np.mean))
print()
print(a['Points'].agg(np.sum))
print()
print(a['Points'].agg(np.size))
print()
print(a['Points'].agg([np.sum, np.mean, np.std]))

    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64

Year
2014    3181
2015    3078
2016    1450
2017    1478
Name: Points, dtype: int64

Year
2014    4
2015    4
2016    2
2017    2
Name: Points, dtype: int64

       sum    mean        std
Year
2014  3181  795.25  87.439026
2015  3078  769.50  65.035888
2016  1450  725.00  43.840620
2017  1478  739.00  69.296465

Groupby: Filtering¶

import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Team')
f = lambda x: len(x) >= 3
print(a.filter(f))

    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
11     690     2  Riders  2017

Part Ⅸ : Merging/Joining¶

import numpy as np
import pandas as pd

df1= pd.read_csv("datas/A.csv", sep=';', header=0)
df2= pd.read_csv("datas/B.csv", sep=';', header=0)
print(df1,"\n--------------------")
print(df2,"\n--------------------")

df = pd.merge(df1,df2,on='id')
print(df,"\n--------------------")
df = pd.merge(df1,df2,on=['id','subject'])
print(df,"\n--------------------")

df=pd.merge(df1, df2, on='subject', how='left')
print(df,"\n--------------------")
df=pd.merge(df1, df2, on='subject', how='right')
print(df,"\n--------------------")
df=pd.merge(df1, df2, on='subject', how='inner')
print(df,"\n--------------------")
df=pd.merge(df1, df2, on='subject', how='outer')
print(df,"\n--------------------")

     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
--------------------
    Name  id subject
0  Billy   1    sub2
1  Brian   2    sub4
2   Bran   3    sub3
3  Bryce   4    sub6
4  Betty   5    sub5
--------------------
   Name_x  id subject_x Name_y subject_y
0    Alex   1      sub1  Billy      sub2
1     Amy   2      sub2  Brian      sub4
2   Allen   3      sub4   Bran      sub3
3   Alice   4      sub6  Bryce      sub6
4  Ayoung   5      sub5  Betty      sub5
--------------------
   Name_x  id subject Name_y
0   Alice   4    sub6  Bryce
1  Ayoung   5    sub5  Betty
--------------------
   Name_x  id_x subject Name_y  id_y
0    Alex     1    sub1    NaN   NaN
1     Amy     2    sub2  Billy   1.0
2   Allen     3    sub4  Brian   2.0
3   Alice     4    sub6  Bryce   4.0
4  Ayoung     5    sub5  Betty   5.0
--------------------
   Name_x  id_x subject Name_y  id_y
0     Amy   2.0    sub2  Billy     1
1   Allen   3.0    sub4  Brian     2
2   Alice   4.0    sub6  Bryce     4
3  Ayoung   5.0    sub5  Betty     5
4     NaN   NaN    sub3   Bran     3
--------------------
   Name_x  id_x subject Name_y  id_y
0     Amy     2    sub2  Billy     1
1   Allen     3    sub4  Brian     2
2   Alice     4    sub6  Bryce     4
3  Ayoung     5    sub5  Betty     5
--------------------
   Name_x  id_x subject Name_y  id_y
0    Alex   1.0    sub1    NaN   NaN
1     Amy   2.0    sub2  Billy   1.0
2   Allen   3.0    sub4  Brian   2.0
3   Alice   4.0    sub6  Bryce   4.0
4  Ayoung   5.0    sub5  Betty   5.0
5     NaN   NaN    sub3   Bran   3.0
--------------------

Part X : Concatenation¶

import numpy as np
import pandas as pd

df1= pd.read_csv("datas/A.csv", sep=';', header=0)
df2= pd.read_csv("datas/B.csv", sep=';', header=0)
print(df1,"\n--------------------")
print(df2,"\n--------------------")

df = pd.concat([df1,df2],axis=0)
print(df,"\n--------------------")
df = pd.concat([df1,df2],axis=0,ignore_index=True)
print(df,"\n--------------------")

df = pd.concat([df1,df2],axis=1)
print(df,"\n--------------------")

df = df1.append(df2)
print(df,"\n--------------------")

     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
--------------------
    Name  id subject
0  Billy   1    sub2
1  Brian   2    sub4
2   Bran   3    sub3
3  Bryce   4    sub6
4  Betty   5    sub5
--------------------
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
0   Billy   1    sub2
1   Brian   2    sub4
2    Bran   3    sub3
3   Bryce   4    sub6
4   Betty   5    sub5
--------------------
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
5   Billy   1    sub2
6   Brian   2    sub4
7    Bran   3    sub3
8   Bryce   4    sub6
9   Betty   5    sub5
--------------------
     Name  id subject   Name  id subject
0    Alex   1    sub1  Billy   1    sub2
1     Amy   2    sub2  Brian   2    sub4
2   Allen   3    sub4   Bran   3    sub3
3   Alice   4    sub6  Bryce   4    sub6
4  Ayoung   5    sub5  Betty   5    sub5
--------------------
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
0   Billy   1    sub2
1   Brian   2    sub4
2    Bran   3    sub3
3   Bryce   4    sub6
4   Betty   5    sub5
--------------------

Part Ⅺ ：Exercises¶

Exercise 1:¶

Read the file tips.csv and print it.

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df)

    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
1  10.34  1.66    Male    Yes  Sat   Lunch     3
2  21.01  3.50    Male     No  Fri  Dinner     3
3  23.68  3.31    Male     No  Sun   Lunch     2
4  24.59  3.61  Female    Yes  Sun  Dinner     4

Exercise 2:¶

Print only the first 3 rows in tips.csv.

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[:3])

    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
1  10.34  1.66    Male    Yes  Sat   Lunch     3
2  21.01  3.50    Male     No  Fri  Dinner     3

Exercise 3:¶

Print only the first 3 rows in tips.csv and the columns bill, sex, size.
Hint: you can use df[['bill','sex', 'size']]

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[:3][['bill','sex', 'size']])

    bill     sex  size
0  16.99  Female     2
1  10.34    Male     3
2  21.01    Male     3

Exercise 4:¶

Print only the last 3 rows in tips.csv and the columns bill, sex, size.

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[-3:][['bill','sex', 'size']])

    bill     sex  size
2  21.01    Male     3
3  23.68    Male     2
4  24.59  Female     4

Exercise 5:¶

Print only those whose time is 'Dinner'.

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
a = df.groupby('time')
print(a.get_group('Dinner'))

    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
2  21.01  3.50    Male     No  Fri  Dinner     3
4  24.59  3.61  Female    Yes  Sun  Dinner     4

Exercise 6:¶

Print only those whose bill is less than 20.

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[df["bill"] < 20])

    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
1  10.34  1.66    Male    Yes  Sat   Lunch     3

Exercise 7:¶

Group by 'sex' and print the result.

import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
a = df.groupby('sex')

for x,y in a.groups.items():
    print(x,"\t",list(y))

Female 	 [0, 4]
Male 	 [1, 2, 3]

Exercise 8:¶

Draw a graph using pandas and opencv for the file shown on the right.

import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,1024,3),np.uint8)
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = []
B = []
pts = []

# put Anomaly values into an array
df = pd.read_csv("datas/temperature.csv",sep=';',header=0)
df = df.sort_values(by='Year')
A = np.append(A,df['Anomaly'].values)
B = np.append(B,df['Year'].values)
n = A.shape[0]

# calculate the points condinates
for i in range(n):
    pts.append([x_max/2+(i*30),y_max/2-A[i]*100])
pts = np.array(pts, np.int32)
print(pts)

# draw the polyline
for i in range(n):
    img = cv2.putText(img2,str(int(B[i])), (pts[i][0],int(y_max/2)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3, (255,255,255), 1, cv2.LINE_AA)
img = cv2.polylines(img,[pts],False,(0,255,255))
img = img[:int(y_max/2+20),int(x_max/2):,:]

plt.imshow(img)
plt.title('my picture')
plt.show()

[[512 191]
 [542 192]
 [572 186]
 [602 190]
 [632 182]
 [662 166]
 [692 162]
 [722 172]
 [752 179]
 [782 163]]

Exercise 9:¶

Draw a graph using pandas and opencv for the file shown on the right.

import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,800,3),np.uint8)+255
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = []
B = []
pts = []
pts2 = []
# put Anomaly values into an array
df = pd.read_csv("datas/temperature.csv",sep=';',header=0)
df = df.sort_values(by='Year')
A = np.append(A,df['Anomaly'].values)
B = np.append(B,df['Year'].values)
n = A.shape[0]

# calculate the points condinates
for i in range(n):
    pts.append([x_max/2+(i*30)+10,y_max/2-A[i]*100])
pts = np.array(pts, np.int32)

for i in range(n):
    pts2.append([x_max/2+(i*30)+30,y_max/2])
pts2 = np.array(pts2, np.int32)

# draw the rectangle
for i in range(n):
    img = cv2.putText(img2,str(int(B[i])), (pts2[i][0]-20,pts2[i][1]+10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3, (0,0,0), 1, cv2.LINE_AA)

for i in range(n):
    img = cv2.rectangle(img,tuple(pts[i]),tuple(pts2[i]),(255,0,0),-1)
img = img[:int(y_max/2+20),int(x_max/2):,:]

plt.imshow(img)
plt.title('my picture')
plt.show()

Exercise 10:¶

Draw a graph using pandas and opencv for the file shown on the right.

import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
import math
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,512,3),np.uint8)#+255
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = np.array([])
B = np.array([])
C = np.array([])
D = np.array([])
E = np.array([])
F = np.array([])
angle = np.array([])
start = np.array([])

x = int(x_max/2)
y = int(y_max/2)
r = 200
# put Anomaly values into an array
df = pd.read_csv("datas/energy.csv",sep=':',header=0)
df = df.sort_values(by='Resource')
A = np.append(A,df['Resource'].values)
B = np.append(B,df['Percentages'].values)
C = np.append(C,df['Rate'].values)
n = B.shape[0]

# calculate the angles
for i in range(n):
    angle = np.append(angle,(int(B[i]*360)))
    start = np.append(start,angle[:i].sum())
start = np.append(start,360)
m = start.shape[0]

# draw the ellipses
for i in range(m-1):
    end = start[i+1]
    color = (255-30*i,255-i*20,i*40)
    img = cv2.ellipse(img2,(x,y),(r,r),0,start[i],end,color,-1)

# get angle to calculate the middle points to put text
for i in range(m-1):
    D = np.append(D,start[i]+(int(start[i+1]-start[i])/2))

# mark points
for i in range(n):
    a = x + int(np.cos(D[i]/180 * np.pi)*r)    #:X
    b = y + int(np.sin(D[i]/180 * np.pi)*r)    #:Y
    E = np.append(E,a)
    F = np.append(F,b)
    img = cv2.circle(img,(a,b), 1, (0,0,0), -1)

# draw lines and put texts
for i in range(n):
    if D[i]>= 90 and D[i]<= 270:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x-r-50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x-r-50),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x-r-50),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
    else:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x+r+50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x+r+25),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x+r+25),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
plt.imshow(img)
plt.title('my picture')
plt.show()

Exercise 11:¶

Draw a graph using pandas and opencv for the file shown on the right.

import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
import math
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,512,3),np.uint8)#+255
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = np.array([])
B = np.array([])
C = np.array([])
D = np.array([])
E = np.array([])
F = np.array([])
angle = np.array([])
start = np.array([])

x = int(x_max/2)
y = int(y_max/2)
r = 200
# put Anomaly values into an array
df = pd.read_csv("datas/energy.csv",sep=':',header=0)
df = df.sort_values(by='Resource')
A = np.append(A,df['Resource'].values)
B = np.append(B,df['Percentages'].values)
C = np.append(C,df['Rate'].values)
n = B.shape[0]

# calculate the angles
for i in range(n):
    angle = np.append(angle,(int(B[i]*360)))
    start = np.append(start,angle[:i].sum())
start = np.append(start,360)
m = start.shape[0]

# draw the ellipses
for i in range(m-1):
    end = start[i+1]
    color = (255-30*i,255-i*20,i*40)
    img = cv2.ellipse(img2,(x,y),(r,r),0,start[i],end,color,-1)

# creat another circle in the middle
img = cv2.circle(img,(x,y), 100, (0,0,0), -1)

# get angle to calculate the middle points to put text
for i in range(m-1):
    D = np.append(D,start[i]+(int(start[i+1]-start[i])/2))

# mark points
for i in range(n):
    a = x + int(np.cos(D[i]/180*math.pi)*r)    #:X
    b = y + int(np.sin(D[i]/180*math.pi)*r)    #:Y
    E = np.append(E,a)
    F = np.append(F,b)
    img = cv2.circle(img,(a,b), 1, (0,0,0), -1)

# draw lines and put texts
for i in range(n):
    if D[i]>= 90 and D[i]<= 270:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x-r-50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x-r-50),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x-r-50),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
    else:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x+r+50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x+r+25),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x+r+25),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
plt.imshow(img)
plt.title('my picture')
plt.show()