Pandas Tutorials


Part I : Basics

Basics1: Quickstart

In [1]:
import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24, 70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframe
df = pd.DataFrame(data, columns = labels)

# Others
print(df)
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

Basics2 : table

In [3]:
import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)
B  = pd.DataFrame()	# an empty dataframe

# dealing with tables
print(df)
print()
print(df.empty)
print()
print(B.empty)
print()
print(df.ndim)
print()
print(df.size)
print()
print(df.shape)
print()
print(df.dtypes)
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

False

True

2

25

(5, 5)

first_name     object
last_name      object
age             int64
math            int64
english       float64
dtype: object

Basics3 : Rows

In [2]:
import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)

# Dealing with rows
print(df[:2])
print()
print(df[:-2])
print()
print(df[2:4])
print()
print(df[-4:-2])
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0

  first_name last_name  age  math  english
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0

  first_name last_name  age  math  english
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0

Basics4 : Columns

In [4]:
import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)


# Dealing with columns
print(df['last_name'])
print()
print(df['last_name'][2:])
print()
print(df['last_name'][-2:])
print()
print(df['last_name'][2:4])
print()
print(df['last_name'][-4:-2])
0     Sahli
1     Cooze
2      Baba
3       Bao
4    Joseph
Name: last_name, dtype: object

2      Baba
3       Bao
4    Joseph
Name: last_name, dtype: object

3       Bao
4    Joseph
Name: last_name, dtype: object

2    Baba
3     Bao
Name: last_name, dtype: object

1    Cooze
2     Baba
Name: last_name, dtype: object

Basics5 : Items

In [5]:
import numpy as np
import pandas as pd

# data and labels
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels= ['first_name','last_name','age','math','english']

# pandas dataframes
df = pd.DataFrame(data, columns = labels)


# Dealing with items
print(df['age'][0])
print()
print(df['age'][3])
print()
print(df['age'].min())
print()
print(df['age'].max())
print()
print(df['age'].mean())
print()
print(df['age'].sum())
42

24

24

73

45.4

227

Part II : Tables

In [6]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframes
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']

df = pd.DataFrame(data, columns = labels)


# Tables
print(df.values)	# convert Table to a Matrix
print()
print(df.head(2))
print()
print(df.tail(2))
print()
B = df.T			# transpose of the table
print(B)
[['Simo' 'Sahli' 42 4 25.0]
 ['Amy' 'Cooze' 52 24 94.0]
 ['Ali' 'Baba' 36 31 57.0]
 ['Jiarong' 'Bao' 24 70 62.0]
 ['Jacob' 'Joseph' 73 3 nan]]

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0

  first_name last_name  age  math  english
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

                0      1     2        3       4
first_name   Simo    Amy   Ali  Jiarong   Jacob
last_name   Sahli  Cooze  Baba      Bao  Joseph
age            42     52    36       24      73
math            4     24    31       70       3
english        25     94    57       62     NaN
In [6]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframes
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']

data2 = [('koko', 'kaka' , 27, 90, 15),
         ('bobo', 'baba' , 25, 14, 97)]
df = pd.DataFrame(data, columns = labels)
df2= pd.DataFrame(data2, columns = labels)

# Tables Concatenation
print(df)
print()
print(df2)
df = df.append(df2)
print()
print(df)
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

  first_name last_name  age  math  english
0       koko      kaka   27    90       15
1       bobo      baba   25    14       97

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN
0       koko      kaka   27    90     15.0
1       bobo      baba   25    14     97.0

Part III : Columns

In [7]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframes
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']
df = pd.DataFrame(data, columns = labels)

# Columns: Column to an array
print(df['last_name'].values)
print()
# Columns: Addition and Deletion
df['score'] = (df['math'] + df['english']) / 2.0
print(df)
print()
# del df['score']
df.pop('score') # the same as del df['score']
print(df)
['Sahli' 'Cooze' 'Baba' 'Bao' 'Joseph']

  first_name last_name  age  math  english  score
0       Simo     Sahli   42     4     25.0   14.5
1        Amy     Cooze   52    24     94.0   59.0
2        Ali      Baba   36    31     57.0   44.0
3    Jiarong       Bao   24    70     62.0   66.0
4      Jacob    Joseph   73     3      NaN    NaN

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN
In [15]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframe
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Simo'   , 'Koko' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']
df = pd.DataFrame(data, columns = labels)

# Columns: Sorting by values
df = df.sort_values(by='math')
print(df)
print("--------------")
df = df.sort_values(by=['first_name','last_name'])
print(df)
print("--------------")
  first_name last_name  age  math  english
4      Jacob    Joseph   73     3      NaN
0       Simo     Sahli   42     4     25.0
1       Simo      Koko   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
--------------
  first_name last_name  age  math  english
2        Ali      Baba   36    31     57.0
4      Jacob    Joseph   73     3      NaN
3    Jiarong       Bao   24    70     62.0
1       Simo      Koko   52    24     94.0
0       Simo     Sahli   42     4     25.0
--------------
In [16]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframe
data = [('Simo'   , 'Sahli' , 42,  4, 25),
        ('Amy'    , 'Cooze' , 52, 24, 94),
        ('Ali'    , 'Baba'  , 36, 31, 57),
        ('Jiarong', 'Bao'   , 24,  70, 62),
        ('Jacob'  , 'Joseph', 73,  3, np.nan)]
labels = ['first_name','last_name','age','math','english']
df = pd.DataFrame(data, columns = labels)

# Columns: Sorting by index
df = df.sort_index(ascending=False)
print(df)
print("--------------")
df = df.sort_index(axis=0)
print(df)
print("--------------")
df = df.sort_index(axis=1)
print(df)
print("--------------")
  first_name last_name  age  math  english
4      Jacob    Joseph   73     3      NaN
3    Jiarong       Bao   24    70     62.0
2        Ali      Baba   36    31     57.0
1        Amy     Cooze   52    24     94.0
0       Simo     Sahli   42     4     25.0
--------------
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN
--------------
   age  english first_name last_name  math
0   42     25.0       Simo     Sahli     4
1   52     94.0        Amy     Cooze    24
2   36     57.0        Ali      Baba    31
3   24     62.0    Jiarong       Bao    70
4   73      NaN      Jacob    Joseph     3
--------------

Part IV : Rows

In [8]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframe
#....


# Rows:
print(df[-4:-2].values)
print()
print(df.iloc[2])
print()
print(df.loc[2])
print()
print(df.loc[2].values)
print()
df = df.drop(2) # deleting row at index 2
print(df)
[['Amy' 'Cooze' 52 24 94.0]
 ['Ali' 'Baba' 36 31 57.0]]

first_name     Ali
last_name     Baba
age             36
math            31
english         57
Name: 2, dtype: object

first_name     Ali
last_name     Baba
age             36
math            31
english         57
Name: 2, dtype: object

['Ali' 'Baba' 36 31 57.0]

  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

Part V : Loops

In [11]:
import numpy as np
import pandas as pd

# data and labels and pandas dataframe
#....

# Loop for columns:
for a,b in df.iteritems():
   print(a)
   print()
   print(b)


# Loop for rows:
for a,b in df.iterrows():
   print(a)
   print()
   print(b)
first_name

0       Simo
1        Amy
3    Jiarong
4      Jacob
Name: first_name, dtype: object
last_name

0     Sahli
1     Cooze
3       Bao
4    Joseph
Name: last_name, dtype: object
age

0    42
1    52
3    24
4    73
Name: age, dtype: int64
math

0     4
1    24
3    70
4     3
Name: math, dtype: int64
english

0    25.0
1    94.0
3    62.0
4     NaN
Name: english, dtype: float64
0

first_name     Simo
last_name     Sahli
age              42
math              4
english          25
Name: 0, dtype: object
1

first_name      Amy
last_name     Cooze
age              52
math             24
english          94
Name: 1, dtype: object
3

first_name    Jiarong
last_name         Bao
age                24
math               70
english            62
Name: 3, dtype: object
4

first_name     Jacob
last_name     Joseph
age               73
math               3
english          NaN
Name: 4, dtype: object

Part VI: Files

Read CSV 1/2

In [38]:
import numpy as np
import pandas as pd

labels= ['first_name','last_name','age','math','english']

df = pd.read_csv("datas/test.csv",sep=',',header=None)
df.columns = labels
print(df)
  first_name last_name  age  math  english
0       Simo     Sahli   42     4     25.0
1        Amy     Cooze   52    24     94.0
2        Ali      Baba   36    31     57.0
3    Jiarong       Bao   24    70     62.0
4      Jacob    Joseph   73     3      NaN

Read CSV 2/2

In [39]:
import numpy as np
import pandas as pd

df = pd.read_csv("datas/test.csv",sep=',',header=0)
print(df)
      Simo   Sahli  42   4    25
0      Amy   Cooze  52  24  94.0
1      Ali    Baba  36  31  57.0
2  Jiarong     Bao  24  70  62.0
3    Jacob  Joseph  73   3   NaN

Write to CSV

In [14]:
import numpy as np
import pandas as pd

df = pd.read_csv("datas/test.csv",sep=',',header=0)
df = df.drop(2)

df.to_csv("datas/test3.csv", encoding='utf-8', index=False)

Part VII : Lambda

lambda 1/2

In [15]:
def my_sum(a,b):
	return a + b

s = lambda x, y : x + y

print(my_sum(5,2))
print()
print(s(5,2))
7

7

lambda 2/2

In [16]:
def myfunc(n):
  return lambda a : a * n

mydoubler = myfunc(2)
mytripler = myfunc(3)

print(mydoubler(5))
print()
print(mydoubler(11))
print()
print(mytripler(5))
print()
print(mytripler(11))
10

22

15

33

Part VIII : Groupby

In [41]:
import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Team')
b = df.groupby(['Team','Year'])
print(type(a.groups))
print(type(b.groups))

print("--------------------")
for x,y in a.groups.items():
    print(x,"\t",list(y))
    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
<class 'dict'>
<class 'dict'>
--------------------
Devils 	 [2, 3]
Kings 	 [4, 5, 6, 7]
Riders 	 [0, 1, 8, 11]
Royals 	 [9, 10]

Groupby: Select

In [18]:
import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Year')
print(a.get_group(2014))
print("--------------------")
a = df.groupby(['Team','Year'])
print(a.get_group(('Riders',2014)))
    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
   Points  Rank    Team  Year
0     876     1  Riders  2014
2     863     2  Devils  2014
4     741     3   Kings  2014
9     701     4  Royals  2014
--------------------
   Points  Rank    Team  Year
0     876     1  Riders  2014

Groupby: Aggregation

In [43]:
import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Year')
print(a['Points'].agg(np.mean))
print()
print(a['Points'].agg(np.sum))
print()
print(a['Points'].agg(np.size))
print()
print(a['Points'].agg([np.sum, np.mean, np.std]))
    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64

Year
2014    3181
2015    3078
2016    1450
2017    1478
Name: Points, dtype: int64

Year
2014    4
2015    4
2016    2
2017    2
Name: Points, dtype: int64

       sum    mean        std
Year
2014  3181  795.25  87.439026
2015  3078  769.50  65.035888
2016  1450  725.00  43.840620
2017  1478  739.00  69.296465

Groupby: Filtering

In [44]:
import numpy as np
import pandas as pd

df = pd.read_csv("datas/ranks.csv", sep=' ', header=0)


print(df)
print("--------------------")


a = df.groupby('Team')
f = lambda x: len(x) >= 3
print(a.filter(f))
    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
2      863     2  Devils  2014
3      673     3  Devils  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
9      701     4  Royals  2014
10     804     1  Royals  2015
11     690     2  Riders  2017
--------------------
    Points  Rank    Team  Year
0      876     1  Riders  2014
1      789     2  Riders  2015
4      741     3   Kings  2014
5      812     4   Kings  2015
6      756     1   Kings  2016
7      788     1   Kings  2017
8      694     2  Riders  2016
11     690     2  Riders  2017

Part Ⅸ : Merging/Joining

In [45]:
import numpy as np
import pandas as pd

df1= pd.read_csv("datas/A.csv", sep=';', header=0)
df2= pd.read_csv("datas/B.csv", sep=';', header=0)
print(df1,"\n--------------------")
print(df2,"\n--------------------")

df = pd.merge(df1,df2,on='id')
print(df,"\n--------------------")
df = pd.merge(df1,df2,on=['id','subject'])
print(df,"\n--------------------")

df=pd.merge(df1, df2, on='subject', how='left')
print(df,"\n--------------------")
df=pd.merge(df1, df2, on='subject', how='right')
print(df,"\n--------------------")
df=pd.merge(df1, df2, on='subject', how='inner')
print(df,"\n--------------------")
df=pd.merge(df1, df2, on='subject', how='outer')
print(df,"\n--------------------")
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
--------------------
    Name  id subject
0  Billy   1    sub2
1  Brian   2    sub4
2   Bran   3    sub3
3  Bryce   4    sub6
4  Betty   5    sub5
--------------------
   Name_x  id subject_x Name_y subject_y
0    Alex   1      sub1  Billy      sub2
1     Amy   2      sub2  Brian      sub4
2   Allen   3      sub4   Bran      sub3
3   Alice   4      sub6  Bryce      sub6
4  Ayoung   5      sub5  Betty      sub5
--------------------
   Name_x  id subject Name_y
0   Alice   4    sub6  Bryce
1  Ayoung   5    sub5  Betty
--------------------
   Name_x  id_x subject Name_y  id_y
0    Alex     1    sub1    NaN   NaN
1     Amy     2    sub2  Billy   1.0
2   Allen     3    sub4  Brian   2.0
3   Alice     4    sub6  Bryce   4.0
4  Ayoung     5    sub5  Betty   5.0
--------------------
   Name_x  id_x subject Name_y  id_y
0     Amy   2.0    sub2  Billy     1
1   Allen   3.0    sub4  Brian     2
2   Alice   4.0    sub6  Bryce     4
3  Ayoung   5.0    sub5  Betty     5
4     NaN   NaN    sub3   Bran     3
--------------------
   Name_x  id_x subject Name_y  id_y
0     Amy     2    sub2  Billy     1
1   Allen     3    sub4  Brian     2
2   Alice     4    sub6  Bryce     4
3  Ayoung     5    sub5  Betty     5
--------------------
   Name_x  id_x subject Name_y  id_y
0    Alex   1.0    sub1    NaN   NaN
1     Amy   2.0    sub2  Billy   1.0
2   Allen   3.0    sub4  Brian   2.0
3   Alice   4.0    sub6  Bryce   4.0
4  Ayoung   5.0    sub5  Betty   5.0
5     NaN   NaN    sub3   Bran   3.0
--------------------

Part X : Concatenation

In [46]:
import numpy as np
import pandas as pd

df1= pd.read_csv("datas/A.csv", sep=';', header=0)
df2= pd.read_csv("datas/B.csv", sep=';', header=0)
print(df1,"\n--------------------")
print(df2,"\n--------------------")

df = pd.concat([df1,df2],axis=0)
print(df,"\n--------------------")
df = pd.concat([df1,df2],axis=0,ignore_index=True)
print(df,"\n--------------------")

df = pd.concat([df1,df2],axis=1)
print(df,"\n--------------------")

df = df1.append(df2)
print(df,"\n--------------------")
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
--------------------
    Name  id subject
0  Billy   1    sub2
1  Brian   2    sub4
2   Bran   3    sub3
3  Bryce   4    sub6
4  Betty   5    sub5
--------------------
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
0   Billy   1    sub2
1   Brian   2    sub4
2    Bran   3    sub3
3   Bryce   4    sub6
4   Betty   5    sub5
--------------------
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
5   Billy   1    sub2
6   Brian   2    sub4
7    Bran   3    sub3
8   Bryce   4    sub6
9   Betty   5    sub5
--------------------
     Name  id subject   Name  id subject
0    Alex   1    sub1  Billy   1    sub2
1     Amy   2    sub2  Brian   2    sub4
2   Allen   3    sub4   Bran   3    sub3
3   Alice   4    sub6  Bryce   4    sub6
4  Ayoung   5    sub5  Betty   5    sub5
--------------------
     Name  id subject
0    Alex   1    sub1
1     Amy   2    sub2
2   Allen   3    sub4
3   Alice   4    sub6
4  Ayoung   5    sub5
0   Billy   1    sub2
1   Brian   2    sub4
2    Bran   3    sub3
3   Bryce   4    sub6
4   Betty   5    sub5
--------------------

Part Ⅺ :Exercises

Exercise 1:

Read the file tips.csv and print it.

In [48]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df)
    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
1  10.34  1.66    Male    Yes  Sat   Lunch     3
2  21.01  3.50    Male     No  Fri  Dinner     3
3  23.68  3.31    Male     No  Sun   Lunch     2
4  24.59  3.61  Female    Yes  Sun  Dinner     4

Exercise 2:

Print only the first 3 rows in tips.csv.

In [49]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[:3])
    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
1  10.34  1.66    Male    Yes  Sat   Lunch     3
2  21.01  3.50    Male     No  Fri  Dinner     3

Exercise 3:

Print only the first 3 rows in tips.csv and the columns bill, sex, size.
Hint: you can use df[['bill','sex', 'size']]

In [50]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[:3][['bill','sex', 'size']])
    bill     sex  size
0  16.99  Female     2
1  10.34    Male     3
2  21.01    Male     3

Exercise 4:

Print only the last 3 rows in tips.csv and the columns bill, sex, size.

In [51]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[-3:][['bill','sex', 'size']])
    bill     sex  size
2  21.01    Male     3
3  23.68    Male     2
4  24.59  Female     4

Exercise 5:

Print only those whose time is 'Dinner'.

In [52]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
a = df.groupby('time')
print(a.get_group('Dinner'))
    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
2  21.01  3.50    Male     No  Fri  Dinner     3
4  24.59  3.61  Female    Yes  Sun  Dinner     4

Exercise 6:

Print only those whose bill is less than 20.

In [53]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
print(df[df["bill"] < 20])
    bill   tip     sex smoker  day    time  size
0  16.99  1.01  Female     No  Sun  Dinner     2
1  10.34  1.66    Male    Yes  Sat   Lunch     3

Exercise 7:

Group by 'sex' and print the result.

In [54]:
import pandas as pd

df = pd.read_csv("datas/tips.csv",sep=':',header=0)
a = df.groupby('sex')

for x,y in a.groups.items():
    print(x,"\t",list(y))
Female 	 [0, 4]
Male 	 [1, 2, 3]

Exercise 8:

Draw a graph using pandas and opencv for the file shown on the right.

In [22]:
import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,1024,3),np.uint8)
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = []
B = []
pts = []

# put Anomaly values into an array
df = pd.read_csv("datas/temperature.csv",sep=';',header=0)
df = df.sort_values(by='Year')
A = np.append(A,df['Anomaly'].values)
B = np.append(B,df['Year'].values)
n = A.shape[0]

# calculate the points condinates
for i in range(n):
    pts.append([x_max/2+(i*30),y_max/2-A[i]*100])
pts = np.array(pts, np.int32)
print(pts)

# draw the polyline
for i in range(n):
    img = cv2.putText(img2,str(int(B[i])), (pts[i][0],int(y_max/2)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3, (255,255,255), 1, cv2.LINE_AA)
img = cv2.polylines(img,[pts],False,(0,255,255))
img = img[:int(y_max/2+20),int(x_max/2):,:]

plt.imshow(img)
plt.title('my picture')
plt.show()
[[512 191]
 [542 192]
 [572 186]
 [602 190]
 [632 182]
 [662 166]
 [692 162]
 [722 172]
 [752 179]
 [782 163]]

Exercise 9:

Draw a graph using pandas and opencv for the file shown on the right.

In [4]:
import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,800,3),np.uint8)+255
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = []
B = []
pts = []
pts2 = []
# put Anomaly values into an array
df = pd.read_csv("datas/temperature.csv",sep=';',header=0)
df = df.sort_values(by='Year')
A = np.append(A,df['Anomaly'].values)
B = np.append(B,df['Year'].values)
n = A.shape[0]

# calculate the points condinates
for i in range(n):
    pts.append([x_max/2+(i*30)+10,y_max/2-A[i]*100])
pts = np.array(pts, np.int32)

for i in range(n):
    pts2.append([x_max/2+(i*30)+30,y_max/2])
pts2 = np.array(pts2, np.int32)

# draw the rectangle
for i in range(n):
    img = cv2.putText(img2,str(int(B[i])), (pts2[i][0]-20,pts2[i][1]+10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3, (0,0,0), 1, cv2.LINE_AA)

for i in range(n):
    img = cv2.rectangle(img,tuple(pts[i]),tuple(pts2[i]),(255,0,0),-1)
img = img[:int(y_max/2+20),int(x_max/2):,:]

plt.imshow(img)
plt.title('my picture')
plt.show()

Exercise 10:

Draw a graph using pandas and opencv for the file shown on the right.

In [1]:
import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
import math
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,512,3),np.uint8)#+255
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = np.array([])
B = np.array([])
C = np.array([])
D = np.array([])
E = np.array([])
F = np.array([])
angle = np.array([])
start = np.array([])

x = int(x_max/2)
y = int(y_max/2)
r = 200
# put Anomaly values into an array
df = pd.read_csv("datas/energy.csv",sep=':',header=0)
df = df.sort_values(by='Resource')
A = np.append(A,df['Resource'].values)
B = np.append(B,df['Percentages'].values)
C = np.append(C,df['Rate'].values)
n = B.shape[0]

# calculate the angles
for i in range(n):
    angle = np.append(angle,(int(B[i]*360)))
    start = np.append(start,angle[:i].sum())
start = np.append(start,360)
m = start.shape[0]

# draw the ellipses
for i in range(m-1):
    end = start[i+1]
    color = (255-30*i,255-i*20,i*40)
    img = cv2.ellipse(img2,(x,y),(r,r),0,start[i],end,color,-1)

# get angle to calculate the middle points to put text
for i in range(m-1):
    D = np.append(D,start[i]+(int(start[i+1]-start[i])/2))

# mark points
for i in range(n):
    a = x + int(np.cos(D[i]/180 * np.pi)*r)    #:X
    b = y + int(np.sin(D[i]/180 * np.pi)*r)    #:Y
    E = np.append(E,a)
    F = np.append(F,b)
    img = cv2.circle(img,(a,b), 1, (0,0,0), -1)

# draw lines and put texts
for i in range(n):
    if D[i]>= 90 and D[i]<= 270:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x-r-50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x-r-50),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x-r-50),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
    else:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x+r+50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x+r+25),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x+r+25),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
plt.imshow(img)
plt.title('my picture')
plt.show()

Exercise 11:

Draw a graph using pandas and opencv for the file shown on the right.

In [2]:
import sys
import cv2
import numpy as np
sys.path.append("..")
import pandas as pd
import math
from matplotlib import pyplot as plt

def horizontal_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(0,int(y)),(int(x),int(y)),color,1)

def vertical_line(zone,x,y,color = (255,255,255)):
    return cv2.line(zone,(int(x),0),(int(x),int(y)),color,1)


# make the background
img = np.zeros((512,512,3),np.uint8)#+255
x_max = img.shape[1]
y_max = img.shape[0]
img1 = vertical_line(img,x_max/2,y_max)
img2 = horizontal_line(img1,x_max,y_max/2)

A = np.array([])
B = np.array([])
C = np.array([])
D = np.array([])
E = np.array([])
F = np.array([])
angle = np.array([])
start = np.array([])

x = int(x_max/2)
y = int(y_max/2)
r = 200
# put Anomaly values into an array
df = pd.read_csv("datas/energy.csv",sep=':',header=0)
df = df.sort_values(by='Resource')
A = np.append(A,df['Resource'].values)
B = np.append(B,df['Percentages'].values)
C = np.append(C,df['Rate'].values)
n = B.shape[0]

# calculate the angles
for i in range(n):
    angle = np.append(angle,(int(B[i]*360)))
    start = np.append(start,angle[:i].sum())
start = np.append(start,360)
m = start.shape[0]

# draw the ellipses
for i in range(m-1):
    end = start[i+1]
    color = (255-30*i,255-i*20,i*40)
    img = cv2.ellipse(img2,(x,y),(r,r),0,start[i],end,color,-1)

# creat another circle in the middle
img = cv2.circle(img,(x,y), 100, (0,0,0), -1)

# get angle to calculate the middle points to put text
for i in range(m-1):
    D = np.append(D,start[i]+(int(start[i+1]-start[i])/2))

# mark points
for i in range(n):
    a = x + int(np.cos(D[i]/180*math.pi)*r)    #:X
    b = y + int(np.sin(D[i]/180*math.pi)*r)    #:Y
    E = np.append(E,a)
    F = np.append(F,b)
    img = cv2.circle(img,(a,b), 1, (0,0,0), -1)

# draw lines and put texts
for i in range(n):
    if D[i]>= 90 and D[i]<= 270:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x-r-50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x-r-50),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x-r-50),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
    else:
        img = cv2.line(img,(int(E[i]),int(F[i])),(int(x+r+50),int(F[i])),(255,255,255),1)
        img = cv2.putText(img,A[i],(int(x+r+25),int(F[i]-10)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
        img = cv2.putText(img,C[i],(int(x+r+25),int(F[i]+15)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255,255,255), 1, cv2.LINE_AA)
plt.imshow(img)
plt.title('my picture')
plt.show()