作者:Chris Albon
译者:飞龙
协议:CC BY-NC-SA 4.0
# 导入模块
import pandas as pd
from IPython.display import display
from IPython.display import Image

raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'first_name', 'last_name'])
df_a

|   | subject_id | first_name | last_name |
|---|---|---|---|
| 0 | 1 | Alex | Anderson |
| 1 | 2 | Amy | Ackerman |
| 2 | 3 | Allen | Ali |
| 3 | 4 | Alice | Aoni |
| 4 | 5 | Ayoung | Atiches |

# 创建第二个数据帧
raw_data = {
    'subject_id': ['4', '5', '6', '7', '8'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns=['subject_id', 'first_name', 'last_name'])
df_b

|   | subject_id | first_name | last_name |
|---|---|---|---|
| 0 | 4 | Billy | Bonder |
| 1 | 5 | Brian | Black |
| 2 | 6 | Bran | Balwner |
| 3 | 7 | Bryce | Brice |
| 4 | 8 | Betty | Btisan |

# 创建第三个数据帧
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns=['subject_id', 'test_id'])
df_n

|   | subject_id | test_id |
|---|---|---|
| 0 | 1 | 51 |
| 1 | 2 | 15 |
| 2 | 3 | 15 |
| 3 | 4 | 61 |
| 4 | 5 | 16 |
| 5 | 7 | 14 |
| 6 | 8 | 15 |
| 7 | 9 | 1 |
| 8 | 10 | 61 |
| 9 | 11 | 16 |
# 将两个数据帧按行连接
df_new = pd.concat([df_a, df_b])
df_new

|   | subject_id | first_name | last_name |
|---|---|---|---|
| 0 | 1 | Alex | Anderson |
| 1 | 2 | Amy | Ackerman |
| 2 | 3 | Allen | Ali |
| 3 | 4 | Alice | Aoni |
| 4 | 5 | Ayoung | Atiches |
| 0 | 4 | Billy | Bonder |
| 1 | 5 | Brian | Black |
| 2 | 6 | Bran | Balwner |
| 3 | 7 | Bryce | Brice |
| 4 | 8 | Betty | Btisan |
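按行连接后,原来的索引 0 到 4 被原样保留,所以出现了重复。如果想得到连续的新索引,可以传入 ignore_index=True;下面是一个小示例,假设 df_a 和 df_b 已如上创建:

# 按行连接并重建索引(0 到 9)
pd.concat([df_a, df_b], ignore_index=True)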
# 将两个数据帧按列连接
pd.concat([df_a, df_b], axis=1)

|   | subject_id | first_name | last_name | subject_id | first_name | last_name |
|---|---|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | 4 | Billy | Bonder |
| 1 | 2 | Amy | Ackerman | 5 | Brian | Black |
| 2 | 3 | Allen | Ali | 6 | Bran | Balwner |
| 3 | 4 | Alice | Aoni | 7 | Bryce | Brice |
| 4 | 5 | Ayoung | Atiches | 8 | Betty | Btisan |
# 将两个数据帧按 subject_id 连接
pd.merge(df_new, df_n, on='subject_id')

|   | subject_id | first_name | last_name | test_id |
|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | 51 |
| 1 | 2 | Amy | Ackerman | 15 |
| 2 | 3 | Allen | Ali | 15 |
| 3 | 4 | Alice | Aoni | 61 |
| 4 | 4 | Billy | Bonder | 61 |
| 5 | 5 | Ayoung | Atiches | 16 |
| 6 | 5 | Brian | Black | 16 |
| 7 | 7 | Bryce | Brice | 14 |
| 8 | 8 | Betty | Btisan | 15 |

# 将两个数据帧分别按照左、右数据帧的 subject_id 连接
pd.merge(df_new, df_n, left_on='subject_id', right_on='subject_id')

|   | subject_id | first_name | last_name | test_id |
|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | 51 |
| 1 | 2 | Amy | Ackerman | 15 |
| 2 | 3 | Allen | Ali | 15 |
| 3 | 4 | Alice | Aoni | 61 |
| 4 | 4 | Billy | Bonder | 61 |
| 5 | 5 | Ayoung | Atiches | 16 |
| 6 | 5 | Brian | Black | 16 |
| 7 | 7 | Bryce | Brice | 14 |
| 8 | 8 | Betty | Btisan | 15 |
使用外连接来合并。

“全外连接产生表 A 和表 B 中所有记录的集合,带有来自两侧的匹配记录。如果没有匹配,则缺少的一侧将包含空值。” -- [来源](http://blog.codinghorror.com/a-visual-explanation-of-sql-joins/)

pd.merge(df_a, df_b, on='subject_id', how='outer')

|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
|---|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | NaN | NaN |
| 1 | 2 | Amy | Ackerman | NaN | NaN |
| 2 | 3 | Allen | Ali | NaN | NaN |
| 3 | 4 | Alice | Aoni | Billy | Bonder |
| 4 | 5 | Ayoung | Atiches | Brian | Black |
| 5 | 6 | NaN | NaN | Bran | Balwner |
| 6 | 7 | NaN | NaN | Bryce | Brice |
| 7 | 8 | NaN | NaN | Betty | Btisan |
使用内连接来合并。

“内连接只产生同时匹配表 A 和表 B 的记录集。” -- 来源同上

pd.merge(df_a, df_b, on='subject_id', how='inner')

|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
|---|---|---|---|---|---|
| 0 | 4 | Alice | Aoni | Billy | Bonder |
| 1 | 5 | Ayoung | Atiches | Brian | Black |

# 使用右连接来合并
pd.merge(df_a, df_b, on='subject_id', how='right')

|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
|---|---|---|---|---|---|
| 0 | 4 | Alice | Aoni | Billy | Bonder |
| 1 | 5 | Ayoung | Atiches | Brian | Black |
| 2 | 6 | NaN | NaN | Bran | Balwner |
| 3 | 7 | NaN | NaN | Bryce | Brice |
| 4 | 8 | NaN | NaN | Betty | Btisan |
使用左连接来合并。

“左外连接从表 A 中产生一组完整的记录,如果在表 B 中有匹配,则带上匹配的记录;如果没有匹配,右侧将包含空值。” -- 来源同上

pd.merge(df_a, df_b, on='subject_id', how='left')

|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
|---|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | NaN | NaN |
| 1 | 2 | Amy | Ackerman | NaN | NaN |
| 2 | 3 | Allen | Ali | NaN | NaN |
| 3 | 4 | Alice | Aoni | Billy | Bonder |
| 4 | 5 | Ayoung | Atiches | Brian | Black |

# 合并时为重复的列名添加后缀
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right'))

|   | subject_id | first_name_left | last_name_left | first_name_right | last_name_right |
|---|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | NaN | NaN |
| 1 | 2 | Amy | Ackerman | NaN | NaN |
| 2 | 3 | Allen | Ali | NaN | NaN |
| 3 | 4 | Alice | Aoni | Billy | Bonder |
| 4 | 5 | Ayoung | Atiches | Brian | Black |
# 基于索引的合并
pd.merge(df_a, df_b, right_index=True, left_index=True)

|   | subject_id_x | first_name_x | last_name_x | subject_id_y | first_name_y | last_name_y |
|---|---|---|---|---|---|---|
| 0 | 1 | Alex | Anderson | 4 | Billy | Bonder |
| 1 | 2 | Amy | Ackerman | 5 | Brian | Black |
| 2 | 3 | Allen | Ali | 6 | Bran | Balwner |
| 3 | 4 | Alice | Aoni | 7 | Bryce | Brice |
| 4 | 5 | Ayoung | Atiches | 8 | Betty | Btisan |
特别感谢 Bob Haffner 指出了一种更好的方法。
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)

# 创建示例数据帧
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | name | reports | year |
|---|---|---|---|
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |

# 列出 df['name'] 的唯一值
df.name.unique()
# array(['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], dtype=object)
# 加载库
import pandas as pd
# 创建 JSON 文件的 URL(或者可以是文件路径)
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.json'
# 将 JSON 文件加载到数据帧中
df = pd.read_json(url, orient='columns')
# 查看前十行
df.head(10)

|   | category | datetime | integer |
|---|---|---|---|
| 0 | 0 | 2015-01-01 00:00:00 | 5 |
| 1 | 0 | 2015-01-01 00:00:01 | 5 |
| 10 | 0 | 2015-01-01 00:00:10 | 5 |
| 11 | 0 | 2015-01-01 00:00:11 | 5 |
| 12 | 0 | 2015-01-01 00:00:12 | 8 |
| 13 | 0 | 2015-01-01 00:00:13 | 9 |
| 14 | 0 | 2015-01-01 00:00:14 | 8 |
| 15 | 0 | 2015-01-01 00:00:15 | 8 |
| 16 | 0 | 2015-01-01 00:00:16 | 2 |
| 17 | 0 | 2015-01-01 00:00:17 | 1 |
# 加载库
import pandas as pd
# 创建 Excel 文件的 URL(或者可以是文件路径)
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.xlsx'
# 将 Excel 文件的第一个工作表加载到数据帧中
df = pd.read_excel(url, sheetname=0, header=1)
# 查看前十行
df.head(10)

|   | 5 | 2015-01-01 00:00:00 | 0 |
|---|---|---|---|
| 0 | 5 | 2015-01-01 00:00:01 | 0 |
| 1 | 9 | 2015-01-01 00:00:02 | 0 |
| 2 | 6 | 2015-01-01 00:00:03 | 0 |
| 3 | 6 | 2015-01-01 00:00:04 | 0 |
| 4 | 9 | 2015-01-01 00:00:05 | 0 |
| 5 | 7 | 2015-01-01 00:00:06 | 0 |
| 6 | 1 | 2015-01-01 00:00:07 | 0 |
| 7 | 6 | 2015-01-01 00:00:08 | 0 |
| 8 | 9 | 2015-01-01 00:00:09 | 0 |
| 9 | 5 | 2015-01-01 00:00:10 | 0 |
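上面的 sheetname 参数是旧版 pandas 的写法;较新版本把它改名为 sheet_name。下面是按新参数名改写的一个小示例,其余假设不变:

# 在较新版本的 pandas 中用 sheet_name 指定工作表
df = pd.read_excel(url, sheet_name=0, header=1)
df.head(10)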
# 导入模块
import pandas as pd
# 加载 excel 文件并赋给 xls_file
xls_file = pd.ExcelFile('../data/example.xls')
xls_file
# <pandas.io.excel.ExcelFile at 0x111912be0>
# 查看电子表格的工作表名称
xls_file.sheet_names
# ['Sheet1']
# 将 xls 文件的 Sheet1 加载为数据帧
df = xls_file.parse('Sheet1')
df

|   | year | deaths_attacker | deaths_defender | soldiers_attacker | soldiers_defender | wounded_attacker | wounded_defender |
|---|---|---|---|---|---|---|---|
| 0 | 1945 | 425 | 423 | 2532 | 37235 | 41 | 14 |
| 1 | 1956 | 242 | 264 | 6346 | 2523 | 214 | 1424 |
| 2 | 1964 | 323 | 1231 | 3341 | 2133 | 131 | 131 |
| 3 | 1969 | 223 | 23 | 6732 | 1245 | 12 | 12 |
| 4 | 1971 | 783 | 23 | 12563 | 2671 | 123 | 34 |
| 5 | 1981 | 436 | 42 | 2356 | 7832 | 124 | 124 |
| 6 | 1982 | 324 | 124 | 253 | 2622 | 264 | 1124 |
| 7 | 1992 | 3321 | 631 | 5277 | 3331 | 311 | 1431 |
| 8 | 1999 | 262 | 232 | 2732 | 2522 | 132 | 122 |
| 9 | 2004 | 843 | 213 | 6278 | 26773 | 623 | 2563 |
# 导入模块
import pandas as pd
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', ".", 'Milner', 'Cooze'],
        'age': [42, 52, 36, 24, 73],
        'preTestScore': [4, 24, 31, ".", "."],
        'postTestScore': ["25,000", "94,000", 57, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | 4 | 25,000 |
| 1 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2 | Tina | . | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | . | 62 |
| 4 | Amy | Cooze | 73 | . | 70 |

# 将数据帧保存为工作目录中的 csv
df.to_csv('pandas_dataframe_importing_csv/example.csv')

df = pd.read_csv('pandas_dataframe_importing_csv/example.csv')
df

|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | 0 | Jason | Miller | 42 | 4 | 25,000 |
| 1 | 1 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2 | 2 | Tina | . | 36 | 31 | 57 |
| 3 | 3 | Jake | Milner | 24 | . | 62 |
| 4 | 4 | Amy | Cooze | 73 | . | 70 |
# 加载无头 CSV
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', header=None)
df

|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | NaN | first_name | last_name | age | preTestScore | postTestScore |
| 1 | 0.0 | Jason | Miller | 42 | 4 | 25,000 |
| 2 | 1.0 | Molly | Jacobson | 52 | 24 | 94,000 |
| 3 | 2.0 | Tina | . | 36 | 31 | 57 |
| 4 | 3.0 | Jake | Milner | 24 | . | 62 |
| 5 | 4.0 | Amy | Cooze | 73 | . | 70 |

# 在加载 csv 时指定列名称
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df

|   | UID | First Name | Last Name | Age | Pre-Test Score | Post-Test Score |
|---|---|---|---|---|---|---|
| 0 | NaN | first_name | last_name | age | preTestScore | postTestScore |
| 1 | 0.0 | Jason | Miller | 42 | 4 | 25,000 |
| 2 | 1.0 | Molly | Jacobson | 52 | 24 | 94,000 |
| 3 | 2.0 | Tina | . | 36 | 31 | 57 |
| 4 | 3.0 | Jake | Milner | 24 | . | 62 |
| 5 | 4.0 | Amy | Cooze | 73 | . | 70 |
# 通过将索引列设置为 UID 来加载 csv
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', index_col='UID', names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df

| UID | First Name | Last Name | Age | Pre-Test Score | Post-Test Score |
|---|---|---|---|---|---|
| NaN | first_name | last_name | age | preTestScore | postTestScore |
| 0.0 | Jason | Miller | 42 | 4 | 25,000 |
| 1.0 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2.0 | Tina | . | 36 | 31 | 57 |
| 3.0 | Jake | Milner | 24 | . | 62 |
| 4.0 | Amy | Cooze | 73 | . | 70 |

# 在加载 csv 时将索引列设置为名字和姓氏
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', index_col=['First Name', 'Last Name'], names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df

| First Name | Last Name | UID | Age | Pre-Test Score | Post-Test Score |
|---|---|---|---|---|---|
| first_name | last_name | NaN | age | preTestScore | postTestScore |
| Jason | Miller | 0.0 | 42 | 4 | 25,000 |
| Molly | Jacobson | 1.0 | 52 | 24 | 94,000 |
| Tina | . | 2.0 | 36 | 31 | 57 |
| Jake | Milner | 3.0 | 24 | . | 62 |
| Amy | Cooze | 4.0 | 73 | . | 70 |
# 在加载 csv 时指定 '.' 为缺失值
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', na_values=['.'])
pd.isnull(df)

|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False |
| 2 | False | False | True | False | False | False |
| 3 | False | False | False | False | True | False |
| 4 | False | False | False | False | True | False |
# 加载 csv,同时指定 '.' 和 'NA' 为 Last Name 列的缺失值,指定 '.' 为 Pre-Test Score 列的缺失值
sentinels = {'Last Name': ['.', 'NA'], 'Pre-Test Score': ['.']}
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', na_values=sentinels)
df

|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | 0 | Jason | Miller | 42 | 4 | 25,000 |
| 1 | 1 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2 | 2 | Tina | . | 36 | 31 | 57 |
| 3 | 3 | Jake | Milner | 24 | . | 62 |
| 4 | 4 | Amy | Cooze | 73 | . | 70 |

注意:这里字典的键是 'Last Name' 和 'Pre-Test Score',而该 csv 文件中的实际列名是 last_name 和 preTestScore,二者并不一致,所以这些缺失值标记没有生效,输出与前面相同。
# 在加载 csv 时跳过前 3 行
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', na_values=sentinels, skiprows=3)
df

|   | 2 | Tina | . | 36 | 31 | 57 |
|---|---|---|---|---|---|---|
| 0 | 3 | Jake | Milner | 24 | . | 62 |
| 1 | 4 | Amy | Cooze | 73 | . | 70 |

# 加载 csv,同时将数字字符串中的 ',' 解释为千位分隔符
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', thousands=',')
df

|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | 0 | Jason | Miller | 42 | 4 | 25000 |
| 1 | 1 | Molly | Jacobson | 52 | 24 | 94000 |
| 2 | 2 | Tina | . | 36 | 31 | 57 |
| 3 | 3 | Jake | Milner | 24 | . | 62 |
| 4 | 4 | Amy | Cooze | 73 | . | 70 |
# 导入模块
import pandas as pd

raw_data = {'patient': [1, 1, 1, 2, 2],
        'obs': [1, 2, 3, 1, 2],
        'treatment': [0, 1, 0, 1, 0],
        'score': [6252, 24243, 2345, 2342, 23525]}
df = pd.DataFrame(raw_data, columns=['patient', 'obs', 'treatment', 'score'])
df

|   | patient | obs | treatment | score |
|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 6252 |
| 1 | 1 | 2 | 1 | 24243 |
| 2 | 1 | 3 | 0 | 2345 |
| 3 | 2 | 1 | 1 | 2342 |
| 4 | 2 | 2 | 0 | 23525 |

制作“宽的”数据。

现在,我们将创建一个“宽的”数据帧:行按患者编号,列按观测编号,单元格的值为得分。

df.pivot(index='patient', columns='obs', values='score')

| patient | obs=1 | obs=2 | obs=3 |
|---|---|---|---|
| 1 | 6252.0 | 24243.0 | 2345.0 |
| 2 | 2342.0 | 23525.0 | NaN |
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)

# 创建示例数据帧
data = {'NAME': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'YEAR': [2012, 2012, 2013, 2014, 2014],
        'REPORTS': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | NAME | REPORTS | YEAR |
|---|---|---|---|
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |

# 小写列名称
# 将转小写的函数映射到所有列名上
df.columns = map(str.lower, df.columns)
df

|   | name | reports | year |
|---|---|---|---|
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
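如果不想直接给 df.columns 赋值,也可以用 rename 做同样的事情;下面是一个等价的小示例,假设 df 已如上创建:

# 用 rename 把所有列名映射为小写
df = df.rename(columns=str.lower)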
# 导入模块
import pandas as pd

# 示例数据帧
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd'],
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df

|   | regiment | company | name | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Nighthawks | 1st | Miller | 4 | 25 |
| 1 | Nighthawks | 1st | Jacobson | 24 | 94 |
| 2 | Nighthawks | 2nd | Ali | 31 | 57 |
| 3 | Nighthawks | 2nd | Milner | 2 | 62 |
| 4 | Dragoons | 1st | Cooze | 3 | 70 |
| 5 | Dragoons | 1st | Jacon | 4 | 25 |
| 6 | Dragoons | 2nd | Ryaner | 24 | 94 |
| 7 | Dragoons | 2nd | Sone | 31 | 57 |
| 8 | Scouts | 1st | Sloan | 2 | 62 |
| 9 | Scouts | 1st | Piger | 3 | 70 |
| 10 | Scouts | 2nd | Riani | 2 | 62 |
| 11 | Scouts | 2nd | Ali | 3 | 70 |
# 创建一个接受两个输入 pre 和 post 的函数
def pre_post_difference(pre, post):
    # 返回二者的差
    return post - pre

# 创建一个新列,它是函数的输出
df['score_change'] = pre_post_difference(df['preTestScore'], df['postTestScore'])
# 查看数据帧
df

|   | regiment | company | name | preTestScore | postTestScore | score_change |
|---|---|---|---|---|---|---|
| 0 | Nighthawks | 1st | Miller | 4 | 25 | 21 |
| 1 | Nighthawks | 1st | Jacobson | 24 | 94 | 70 |
| 2 | Nighthawks | 2nd | Ali | 31 | 57 | 26 |
| 3 | Nighthawks | 2nd | Milner | 2 | 62 | 60 |
| 4 | Dragoons | 1st | Cooze | 3 | 70 | 67 |
| 5 | Dragoons | 1st | Jacon | 4 | 25 | 21 |
| 6 | Dragoons | 2nd | Ryaner | 24 | 94 | 70 |
| 7 | Dragoons | 2nd | Sone | 31 | 57 | 26 |
| 8 | Scouts | 1st | Sloan | 2 | 62 | 60 |
| 9 | Scouts | 1st | Piger | 3 | 70 | 67 |
| 10 | Scouts | 2nd | Riani | 2 | 62 | 60 |
| 11 | Scouts | 2nd | Ali | 3 | 70 | 67 |
# 创建一个接受一个输入 x 的函数
def score_multipler_2x_and_3x(x):
    # 返回两个值:2x 和 3x
    return x * 2, x * 3

# 创建两个新列,它们是函数的两个输出
df['post_score_x2'], df['post_score_x3'] = zip(*df['postTestScore'].map(score_multipler_2x_and_3x))
df

|   | regiment | company | name | preTestScore | postTestScore | score_change | post_score_x2 | post_score_x3 |
|---|---|---|---|---|---|---|---|---|
| 0 | Nighthawks | 1st | Miller | 4 | 25 | 21 | 50 | 75 |
| 1 | Nighthawks | 1st | Jacobson | 24 | 94 | 70 | 188 | 282 |
| 2 | Nighthawks | 2nd | Ali | 31 | 57 | 26 | 114 | 171 |
| 3 | Nighthawks | 2nd | Milner | 2 | 62 | 60 | 124 | 186 |
| 4 | Dragoons | 1st | Cooze | 3 | 70 | 67 | 140 | 210 |
| 5 | Dragoons | 1st | Jacon | 4 | 25 | 21 | 50 | 75 |
| 6 | Dragoons | 2nd | Ryaner | 24 | 94 | 70 | 188 | 282 |
| 7 | Dragoons | 2nd | Sone | 31 | 57 | 26 | 114 | 171 |
| 8 | Scouts | 1st | Sloan | 2 | 62 | 60 | 124 | 186 |
| 9 | Scouts | 1st | Piger | 3 | 70 | 67 | 140 | 210 |
| 10 | Scouts | 2nd | Riani | 2 | 62 | 60 | 124 | 186 |
| 11 | Scouts | 2nd | Ali | 3 | 70 | 67 | 140 | 210 |
# 导入模块
import pandas as pd

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 52, 36, 24, 73],
        'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city'])
df

|   | first_name | last_name | age | city |
|---|---|---|---|---|
| 0 | Jason | Miller | 42 | San Francisco |
| 1 | Molly | Jacobson | 52 | Baltimore |
| 2 | Tina | Ali | 36 | Miami |
| 3 | Jake | Milner | 24 | Douglas |
| 4 | Amy | Cooze | 73 | Boston |

# 创建值的字典
city_to_state = {'San Francisco': 'California',
                 'Baltimore': 'Maryland',
                 'Miami': 'Florida',
                 'Douglas': 'Arizona',
                 'Boston': 'Massachusetts'}
df['state'] = df['city'].map(city_to_state)
df

|   | first_name | last_name | age | city | state |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | San Francisco | California |
| 1 | Molly | Jacobson | 52 | Baltimore | Maryland |
| 2 | Tina | Ali | 36 | Miami | Florida |
| 3 | Jake | Milner | 24 | Douglas | Arizona |
| 4 | Amy | Cooze | 73 | Boston | Massachusetts |
# 导入模块
import pandas as pd
import numpy as np

raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | sex | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |

# 丢弃缺失值
df_no_missing = df.dropna()
df_no_missing

|   | first_name | last_name | age | sex | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |
# 删除所有单元格都为 NA 的行
df_cleaned = df.dropna(how='all')
df_cleaned

|   | first_name | last_name | age | sex | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |

# 创建一个全是缺失值的新列
df['location'] = np.nan
df

|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
|---|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
# 如果列仅包含缺失值,删除该列
df.dropna(axis=1, how='all')

|   | first_name | last_name | age | sex | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |
# 删除非缺失值少于五个的行
# 这对时间序列来说非常有用
df.dropna(thresh=5)

|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
|---|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |

# 用 0 填充缺失值
df.fillna(0)

|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
|---|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | 0.0 |
| 1 | 0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 |
| 2 | Tina | Ali | 36.0 | f | 0.0 | 0.0 | 0.0 |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | 0.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | 0.0 |
# 使用 preTestScore 的平均值填充 preTestScore 中的缺失值
# inplace=True 表示更改会立即保存到 df 中
df["preTestScore"].fillna(df["preTestScore"].mean(), inplace=True)
df

|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
|---|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 1 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | 3.0 | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
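在较新版本的 pandas 中,对单独取出的一列使用 inplace=True 可能不会作用到原数据帧并给出警告;下面是一个把结果赋值回去的等价小示例,假设 df 如上:

# 等价写法:不使用 inplace,把填充结果赋值回该列
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())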
# 使用 postTestScore 按性别分组的均值填充 postTestScore 中的缺失值
df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean"), inplace=True)
df

|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
|---|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 1 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | 3.0 | 70.0 | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |

# 选择年龄不是 NaN 且性别不是 NaN 的行
df[df['age'].notnull() & df['sex'].notnull()]

|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
|---|---|---|---|---|---|---|---|
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 2 | Tina | Ali | 36.0 | f | 3.0 | 70.0 | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
# 导入模块
import pandas as pd
# 创建数据
data = {'score': [1, 1, 1, 2, 2, 2, 3, 3, 3]}
# 创建数据帧
df = pd.DataFrame(data)
# 查看数据帧
df

|   | score |
|---|---|
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 2 |
| 4 | 2 |
| 5 | 2 |
| 6 | 3 |
| 7 | 3 |
| 8 | 3 |

# 计算移动平均。也就是说,取前两个值求平均,
# 然后丢弃第一个,再加入第三个,以此类推。
df.rolling(window=2).mean()

|   | score |
|---|---|
| 0 | NaN |
| 1 | 1.0 |
| 2 | 1.0 |
| 3 | 1.5 |
| 4 | 2.0 |
| 5 | 2.0 |
| 6 | 2.5 |
| 7 | 3.0 |
| 8 | 3.0 |
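窗口还没有填满时(第 0 行)结果为 NaN。如果希望窗口不满时也用已有的值计算,可以加上 min_periods 参数;下面是一个小示例,假设 df 如上:

# 窗口不足 2 个值时用现有值求均值(第 0 行变为 1.0)
df.rolling(window=2, min_periods=1).mean()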
# 导入所需模块
import pandas as pd
from sklearn import preprocessing
# 设置图表为内联显示
%matplotlib inline

# 创建示例数据帧,带有一列未规范化的数据
data = {'score': [234, 24, 14, 27, -74, 46, 73, -18, 59, 160]}
df = pd.DataFrame(data)
df

|   | score |
|---|---|
| 0 | 234 |
| 1 | 24 |
| 2 | 14 |
| 3 | 27 |
| 4 | -74 |
| 5 | 46 |
| 6 | 73 |
| 7 | -18 |
| 8 | 59 |
| 9 | 160 |

# 查看未规范化的数据
df['score'].plot(kind='bar')
# <matplotlib.axes._subplots.AxesSubplot at 0x11b9c88d0>

# 创建 x,它是 score 列的值转换成的浮点数组
x = df[['score']].values.astype(float)
# 创建 minmax 缩放器对象
min_max_scaler = preprocessing.MinMaxScaler()
# 拟合 minmax 缩放器并转换数据
x_scaled = min_max_scaler.fit_transform(x)
# 把规范化后的结果放入数据帧
df_normalized = pd.DataFrame(x_scaled)
# 查看数据帧
df_normalized

|   | 0 |
|---|---|
| 0 | 1.000000 |
| 1 | 0.318182 |
| 2 | 0.285714 |
| 3 | 0.327922 |
| 4 | 0.000000 |
| 5 | 0.389610 |
| 6 | 0.477273 |
| 7 | 0.181818 |
| 8 | 0.431818 |
| 9 | 0.759740 |

# 绘制数据帧
df_normalized.plot(kind='bar')
# <matplotlib.axes._subplots.AxesSubplot at 0x11ba31c50>
# 导入模块
import pandas as pd

raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd'],
        'TestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3]}
df = pd.DataFrame(raw_data, columns=['regiment', 'company', 'TestScore'])
df

|   | regiment | company | TestScore |
|---|---|---|---|
| 0 | Nighthawks | 1st | 4 |
| 1 | Nighthawks | 1st | 24 |
| 2 | Nighthawks | 2nd | 31 |
| 3 | Nighthawks | 2nd | 2 |
| 4 | Dragoons | 1st | 3 |
| 5 | Dragoons | 1st | 4 |
| 6 | Dragoons | 2nd | 24 |
| 7 | Dragoons | 2nd | 31 |
| 8 | Scouts | 1st | 2 |
| 9 | Scouts | 1st | 3 |
| 10 | Scouts | 2nd | 2 |
| 11 | Scouts | 2nd | 3 |

# 创建按团(regiment)和连(company)分组求均值的透视表
pd.pivot_table(df, index=['regiment', 'company'], aggfunc='mean')

| regiment | company | TestScore |
|---|---|---|
| Dragoons | 1st | 3.5 |
| Dragoons | 2nd | 27.5 |
| Nighthawks | 1st | 14.0 |
| Nighthawks | 2nd | 16.5 |
| Scouts | 1st | 2.5 |
| Scouts | 2nd | 2.5 |

# 创建按团和连分组计数的透视表
df.pivot_table(index=['regiment', 'company'], aggfunc='count')

| regiment | company | TestScore |
|---|---|---|
| Dragoons | 1st | 2 |
| Dragoons | 2nd | 2 |
| Nighthawks | 1st | 2 |
| Nighthawks | 2nd | 2 |
| Scouts | 1st | 2 |
| Scouts | 2nd | 2 |
我经常需要或想要改变一列字符串中所有项目的大小写(例如 BRAZIL 到 Brazil 等)。有很多方法可以实现这一目标,但我已经确定这是最容易和最快的方法。
# 导入 pandas
import pandas as pd
# 创建名称的列表
first_names = pd.Series(['Steve Murrey', 'Jane Fonda', 'Sara McGully', 'Mary Jane'])
# 打印列
first_names
'''
0 Steve Murrey
1 Jane Fonda
2 Sara McGully
3 Mary Jane
dtype: object
'''
# 打印列的小写
first_names.str.lower()
'''
0 steve murrey
1 jane fonda
2 sara mcgully
3 mary jane
dtype: object
'''
# 打印列的大写
first_names.str.upper()
'''
0 STEVE MURREY
1 JANE FONDA
2 SARA MCGULLY
3 MARY JANE
dtype: object
'''
# 打印列的标题大小写
first_names.str.title()
'''
0 Steve Murrey
1 Jane Fonda
2 Sara Mcgully
3 Mary Jane
dtype: object
'''
# 打印以空格分割的列
first_names.str.split(" ")
'''
0 [Steve, Murrey]
1 [Jane, Fonda]
2 [Sara, McGully]
3 [Mary, Jane]
dtype: object
'''
# 打印首字母大写的列
first_names.str.capitalize()
'''
0 Steve murrey
1 Jane fonda
2 Sara mcgully
3 Mary jane
dtype: object
'''
明白了吧。pandas 还提供了更多字符串方法,可参见官方文档。
# 导入模块
import pandas as pd
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 52, 36, 24, 73],
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | 4 | 25 |
| 1 | Molly | Jacobson | 52 | 24 | 94 |
| 2 | Tina | Ali | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 3 | 70 |

# 不放回地选择大小为 2 的随机子集
df.take(np.random.permutation(len(df))[:2])

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 1 | Molly | Jacobson | 52 | 24 | 94 |
| 4 | Amy | Cooze | 73 | 3 | 70 |
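较新版本的 pandas 提供了更直接的 sample 方法,默认即不放回抽样;下面是一个等价的小示例,假设 df 如上:

# 不放回地随机抽取 2 行
df.sample(n=2)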
# 导入模块
import pandas as pd
# 创建数据帧
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | coverage | name | reports | year |
|---|---|---|---|---|
| Cochice | 25 | Jason | 4 | 2012 |
| Pima | 94 | Molly | 24 | 2012 |
| Santa Cruz | 57 | Tina | 31 | 2013 |
| Maricopa | 62 | Jake | 2 | 2014 |
| Yuma | 70 | Amy | 3 | 2014 |

5 rows × 4 columns

# 创建一个新列,其值为 coverage 的升序排名
df['coverageRanked'] = df['coverage'].rank(ascending=1)
df

|   | coverage | name | reports | year | coverageRanked |
|---|---|---|---|---|---|
| Cochice | 25 | Jason | 4 | 2012 | 1 |
| Pima | 94 | Molly | 24 | 2012 | 5 |
| Santa Cruz | 57 | Tina | 31 | 2013 | 2 |
| Maricopa | 62 | Jake | 2 | 2014 | 3 |
| Yuma | 70 | Amy | 3 | 2014 | 4 |

5 rows × 5 columns
# 导入正则包
import re
import sys

text = 'The quick brown fox jumped over the lazy black bear.'

three_letter_word = '\w{3}'
pattern_re = re.compile(three_letter_word); pattern_re
# re.compile(r'\w{3}', re.UNICODE)

re_search = re.search('..own', text)
if re_search:
    # 打印搜索结果
    print(re_search.group())
# brown

re.match

re.match() 仅用于从字符串的开头开始匹配(或者匹配整个字符串)。对于其他任何情况,请使用 re.search()。

# 尝试从字符串开头匹配 '..own'
re_match = re.match('..own', text)
if re_match:
    # 打印所有匹配
    print(re_match.group())
else:
    # 否则打印这个
    print('No matches')
# No matches
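为了对比 search 和 match,下面用同一个模式分别调用两者,说明 match 只从字符串开头匹配;这是一个小示例,沿用上面的 text:

# search 在任意位置查找,能找到 'brown'
print(re.search('..own', text).group())
# match 只从开头匹配,text 不以该模式开头,所以返回 None
print(re.match('..own', text))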
re.split

# 使用 'e' 作为分隔符拆分字符串
re_split = re.split('e', text); re_split
# ['Th', ' quick brown fox jump', 'd ov', 'r th', ' lazy black b', 'ar.']

re.sub

用其他字符串替换匹配正则表达式模式的部分。第三个参数 3 表示要进行的最大替换次数。

# 用 'E' 替换前三个 'e' 实例,然后打印出来
re_sub = re.sub('e', 'E', text, 3); print(re_sub)
# ThE quick brown fox jumpEd ovEr the lazy black bear.
# 导入 regex
import re
# 创建一些数据
text = 'A flock of 120 quick brown foxes jumped over 30 lazy brown, bears.'
re.findall('^A', text)
# ['A']
re.findall('bears.$', text)
# ['bears.']
re.findall('f..es', text)
# ['foxes']
# 寻找所有元音
re.findall('[aeiou]', text)
# ['o', 'o', 'u', 'i', 'o', 'o', 'e', 'u', 'e', 'o', 'e', 'a', 'o', 'e', 'a']
# 查找不是小写元音的所有字符
re.findall('[^aeiou]', text)
'''
['A',
' ',
'f',
'l',
'c',
'k',
' ',
'f',
' ',
'1',
'2',
'0',
' ',
'q',
'c',
'k',
' ',
'b',
'r',
'w',
'n',
' ',
'f',
'x',
's',
' ',
'j',
'm',
'p',
'd',
' ',
'v',
'r',
' ',
'3',
'0',
' ',
'l',
'z',
'y',
' ',
'b',
'r',
'w',
'n',
',',
' ',
'b',
'r',
's',
'.']
'''
re.findall('a|A', text)
# ['A', 'a', 'a']
# 寻找 'foxes' 的实例(带捕获分组)
re.findall('(foxes)', text)
# ['foxes']
# 寻找所有五个字母的单词
re.findall('\w\w\w\w\w', text)
# ['flock', 'quick', 'brown', 'foxes', 'jumpe', 'brown', 'bears']
re.findall('\W\W', text)
# [', ']
re.findall('\s', text)
# [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
re.findall('\S\S', text)
'''
['fl',
'oc',
'of',
'12',
'qu',
'ic',
'br',
'ow',
'fo',
'xe',
'ju',
'mp',
'ed',
'ov',
'er',
'30',
'la',
'zy',
'br',
'ow',
'n,',
'be',
'ar',
's.']
'''
re.findall('\d\d\d', text)
# ['120']
re.findall('\D\D\D\D\D', text)
'''
['A flo',
'ck of',
' quic',
'k bro',
'wn fo',
'xes j',
'umped',
' over',
' lazy',
' brow',
'n, be']
'''
re.findall('\AA', text)
# ['A']
re.findall('bears.\Z', text)
# ['bears.']
re.findall('\b [foxes]', text)
# []
re.findall('\n ', text)
# []
re.findall('[Ff]oxes', 'foxes Foxes Doxes')
# ['foxes', 'Foxes']
re.findall('[a-z]', 'foxes Foxes')
# ['f', 'o', 'x', 'e', 's', 'o', 'x', 'e', 's']
re.findall('[A-Z]', 'foxes Foxes')
# ['F']
re.findall('[a-zA-Z0-9]', 'foxes Foxes')
# ['f', 'o', 'x', 'e', 's', 'F', 'o', 'x', 'e', 's']
re.findall('[^aeiou]', 'foxes Foxes')
# ['f', 'x', 's', ' ', 'F', 'x', 's']
re.findall('[^0-9]', 'foxes Foxes')
# ['f', 'o', 'x', 'e', 's', ' ', 'F', 'o', 'x', 'e', 's']
re.findall('foxes?', 'foxes Foxes')
# ['foxes']
re.findall('ox*', 'foxes Foxes')
# ['ox', 'ox']
re.findall('ox+', 'foxes Foxes')
# ['ox', 'ox']
re.findall('\d{3}', text)
# ['120']
re.findall('\d{2,}', text)
# ['120', '30']
re.findall('\d{2,3}', text)
# ['120', '30']
re.findall('^A', text)
# ['A']
re.findall('bears.$', text)
# ['bears.']
re.findall('\AA', text)
# ['A']
re.findall('bears.\Z', text)
# ['bears.']
re.findall('bears(?=.)', text)
# ['bears']
re.findall('foxes(?!!)', 'foxes foxes!')
# ['foxes']
re.findall('foxes|foxes!', 'foxes foxes!')
# ['foxes', 'foxes']
re.findall('fox(es!)', 'foxes foxes!')
# ['es!']
re.findall('foxes(!)', 'foxes foxes!')
# ['!']
# 导入模块
import pandas as pd
import numpy as np

# 创建亚利桑那州南部的火灾风险序列
brushFireRisk = pd.Series([34, 23, 12, 23], index=['Bisbee', 'Douglas', 'Sierra Vista', 'Tombstone'])
brushFireRisk
'''
Bisbee 34
Douglas 23
Sierra Vista 12
Tombstone 23
dtype: int64
'''
# 重索引这个序列并创建一个新的序列变量
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'])
brushFireRiskReindexed
'''
Tombstone 23.0
Douglas 23.0
Bisbee 34.0
Sierra Vista 12.0
Barley NaN
Tucson NaN
dtype: float64
'''
# 重索引序列并在任何缺失的索引处填入 0
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'], fill_value=0)
brushFireRiskReindexed
'''
Tombstone 23
Douglas 23
Bisbee 34
Sierra Vista 12
Barley 0
Tucson 0
dtype: int64
'''
# 创建数据帧
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df

|   | county | reports | year |
|---|---|---|---|
| 0 | Cochice | 4 | 2012 |
| 1 | Pima | 24 | 2012 |
| 2 | Santa Cruz | 31 | 2013 |
| 3 | Maricopa | 2 | 2014 |
| 4 | Yuma | 3 | 2014 |

# 更改行的顺序(索引)
df.reindex([4, 3, 2, 1, 0])

|   | county | reports | year |
|---|---|---|---|
| 4 | Yuma | 3 | 2014 |
| 3 | Maricopa | 2 | 2014 |
| 2 | Santa Cruz | 31 | 2013 |
| 1 | Pima | 24 | 2012 |
| 0 | Cochice | 4 | 2012 |

# 更改列的顺序(索引)
columnsTitles = ['year', 'reports', 'county']
df.reindex(columns=columnsTitles)

|   | year | reports | county |
|---|---|---|---|
| 0 | 2012 | 4 | Cochice |
| 1 | 2012 | 24 | Pima |
| 2 | 2013 | 31 | Santa Cruz |
| 3 | 2014 | 2 | Maricopa |
| 4 | 2014 | 3 | Yuma |
来自 StackOverflow 上的 rgalbo。

# 导入所需模块
import pandas as pd

# 创建字典,值为列表
raw_data = {'0': ['first_name', 'Molly', 'Tina', 'Jake', 'Amy'],
        '1': ['last_name', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        '2': ['age', 52, 36, 24, 73],
        '3': ['preTestScore', 24, 31, 2, 3]}
# 创建数据帧
df = pd.DataFrame(raw_data)
# 查看数据帧
df

|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | first_name | last_name | age | preTestScore |
| 1 | Molly | Jacobson | 52 | 24 |
| 2 | Tina | Ali | 36 | 31 |
| 3 | Jake | Milner | 24 | 2 |
| 4 | Amy | Cooze | 73 | 3 |
# 从数据集的第一行创建一个名为 header 的新变量
header = df.iloc[0]
'''
0 first_name
1 last_name
2 age
3 preTestScore
Name: 0, dtype: object
'''
# 将数据帧替换为不包含第一行的新数据帧
df = df[1:]
# 使用 header 变量重命名数据帧的列名
df.rename(columns=header)

|   | first_name | last_name | age | preTestScore |
|---|---|---|---|---|
| 1 | Molly | Jacobson | 52 | 24 |
| 2 | Tina | Ali | 36 | 31 |
| 3 | Jake | Milner | 24 | 2 |
| 4 | Amy | Cooze | 73 | 3 |
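注意 df.rename(columns=header) 返回的是重命名后的新数据帧,并没有修改 df 本身;如果想保留这个结果,需要把它赋值回去。下面是一个小示例,假设 df 和 header 如上:

# 把重命名结果保存回 df
df = df.rename(columns=header)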
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)

# 创建示例数据帧
data = {'Commander': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'Date': ['2012, 02, 08', '2012, 02, 08', '2012, 02, 08', '2012, 02, 08', '2012, 02, 08'],
        'Score': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | Commander | Date | Score |
|---|---|---|---|
| Cochice | Jason | 2012, 02, 08 | 4 |
| Pima | Molly | 2012, 02, 08 | 24 |
| Santa Cruz | Tina | 2012, 02, 08 | 31 |
| Maricopa | Jake | 2012, 02, 08 | 2 |
| Yuma | Amy | 2012, 02, 08 | 3 |

# 重命名列名
df.columns = ['Leader', 'Time', 'Score']
df

|   | Leader | Time | Score |
|---|---|---|---|
| Cochice | Jason | 2012, 02, 08 | 4 |
| Pima | Molly | 2012, 02, 08 | 24 |
| Santa Cruz | Tina | 2012, 02, 08 | 31 |
| Maricopa | Jake | 2012, 02, 08 | 2 |
| Yuma | Amy | 2012, 02, 08 | 3 |

df.rename(columns={'Leader': 'Commander'}, inplace=True)
df

|   | Commander | Time | Score |
|---|---|---|---|
| Cochice | Jason | 2012, 02, 08 | 4 |
| Pima | Molly | 2012, 02, 08 | 24 |
| Santa Cruz | Tina | 2012, 02, 08 | 31 |
| Maricopa | Jake | 2012, 02, 08 | 2 |
| Yuma | Amy | 2012, 02, 08 | 3 |
# 导入模块
import pandas as pd
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 52, 36, 24, 73],
        'preTestScore': [-999, -999, -999, 2, 1],
        'postTestScore': [2, 2, -999, 2, -999]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | -999 | 2 |
| 1 | Molly | Jacobson | 52 | -999 | 2 |
| 2 | Tina | Ali | 36 | -999 | -999 |
| 3 | Jake | Milner | 24 | 2 | 2 |
| 4 | Amy | Cooze | 73 | 1 | -999 |

# 将所有 -999 替换为 NaN
df.replace(-999, np.nan)

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | NaN | 2.0 |
| 1 | Molly | Jacobson | 52 | NaN | 2.0 |
| 2 | Tina | Ali | 36 | NaN | NaN |
| 3 | Jake | Milner | 24 | 2.0 | 2.0 |
| 4 | Amy | Cooze | 73 | 1.0 | NaN |
# 导入模块
import pandas as pd

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 52, 36, 24, 73],
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | 4 | 25 |
| 1 | Molly | Jacobson | 52 | 24 | 94 |
| 2 | Tina | Ali | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 3 | 70 |

将名为 df 的数据帧保存为 csv。
# 导入模块
import pandas as pd

raw_data = {'first_name': ['Jason', 'Jason', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Miller', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 42, 36, 24, 73],
        'preTestScore': [4, 4, 31, 2, 3],
        'postTestScore': [25, 25, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | 4 | 25 |
| 1 | Jason | Miller | 42 | 4 | 25 |
| 2 | Tina | Ali | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 3 | 70 |

# 根据条件查看某列的值
# 查看 postTestScore 大于 50 的行对应的 preTestScore,其余行为 NaN
df['preTestScore'].where(df['postTestScore'] > 50)
'''
0 NaN
1 NaN
2 31.0
3 2.0
4 3.0
Name: preTestScore, dtype: float64
'''
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)

# 创建示例数据帧
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | name | reports | year |
|---|---|---|---|
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |

# 按照列值抓取行
value_list = ['Tina', 'Molly', 'Jason']
df[df.name.isin(value_list)]

|   | name | reports | year |
|---|---|---|---|
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |

# 获取列值不在某个列表中的行
df[~df.name.isin(value_list)]

|   | name | reports | year |
|---|---|---|---|
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
import pandas as pd

# 创建示例数据帧
data = {'name': ['Jason', 'Molly'],
        'country': [['Syria', 'Lebanon'], ['Spain', 'Morocco']]}
df = pd.DataFrame(data)
df

|   | country | name |
|---|---|---|
| 0 | [Syria, Lebanon] | Jason |
| 1 | [Spain, Morocco] | Molly |

df[df['country'].map(lambda country: 'Syria' in country)]

|   | country | name |
|---|---|---|
| 0 | [Syria, Lebanon] | Jason |
import pandas as pd

# 创建示例数据帧
data = {'name': ['A', 'B', 'C', 'D', 'E'],
        'score': [1, 2, 3, 4, 5]}
df = pd.DataFrame(data)
df

|   | name | score |
|---|---|---|
| 0 | A | 1 |
| 1 | B | 2 |
| 2 | C | 3 |
| 3 | D | 4 |
| 4 | E | 5 |

# 选择数据帧中 df.score 大于 1 且小于 5 的行
df[(df['score'] > 1) & (df['score'] < 5)]

|   | name | score |
|---|---|---|
| 1 | B | 2 |
| 2 | C | 3 |
| 3 | D | 4 |
# 导入模块
import pandas as pd
import numpy as np

# 创建数据帧
raw_data = {'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan],
        'nationality': ['USA', 'USA', 'France', 'UK', 'UK'],
        'age': [42, 52, 36, 24, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'nationality', 'age'])
df

|   | first_name | nationality | age |
|---|---|---|---|
| 0 | Jason | USA | 42 |
| 1 | Molly | USA | 52 |
| 2 | NaN | France | 36 |
| 3 | NaN | UK | 24 |
| 4 | NaN | UK | 70 |

# 方法 1:使用布尔变量
# 如果国籍是美国,则变量为 True
american = df['nationality'] == "USA"
# 如果年龄大于 50,则变量为 True
elderly = df['age'] > 50
# 选择所有国籍为美国且年龄大于 50 的案例
df[american & elderly]

|   | first_name | nationality | age |
|---|---|---|---|
| 1 | Molly | USA | 52 |

# 方法 2:使用变量属性
# 选择所有名字不缺失且国籍为美国的案例
df[df['first_name'].notnull() & (df['nationality'] == "USA")]

|   | first_name | nationality | age |
|---|---|---|---|
| 0 | Jason | USA | 42 |
| 1 | Molly | USA | 52 |
# 导入模块
import pandas as pd

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 52, 36, 24, 73],
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Jason | Miller | 42 | 4 | 25 |
| 1 | Molly | Jacobson | 52 | 24 | 94 |
| 2 | Tina | Ali | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 3 | 70 |

# 创建第二个数据帧
raw_data_2 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'],
        'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
        'age': [53, 26, 72, 73, 24],
        'preTestScore': [13, 52, 72, 26, 26],
        'postTestScore': [82, 52, 56, 234, 254]}
df_2 = pd.DataFrame(raw_data_2, columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df_2

|   | first_name | last_name | age | preTestScore | postTestScore |
|---|---|---|---|---|---|
| 0 | Sarah | Mornig | 53 | 13 | 82 |
| 1 | Gueniva | Jaker | 26 | 52 | 52 |
| 2 | Know | Alom | 72 | 72 | 56 |
| 3 | Sara | Ormon | 73 | 26 | 234 |
| 4 | Cat | Koozer | 24 | 26 | 254 |

# 创建第三个数据帧
raw_data_3 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'],
        'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
        'postTestScore_2': [82, 52, 56, 234, 254]}
df_3 = pd.DataFrame(raw_data_3, columns=['first_name', 'last_name', 'postTestScore_2'])
df_3

|   | first_name | last_name | postTestScore_2 |
|---|---|---|---|
| 0 | Sarah | Mornig | 82 |
| 1 | Gueniva | Jaker | 52 |
| 2 | Know | Alom | 56 |
| 3 | Sara | Ormon | 234 |
| 4 | Cat | Koozer | 254 |
# 导入模块
import pandas as pd

data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [1, 2, 1, 2, 3],
        'coverage': [2, 2, 3, 3, 3]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | coverage | name | reports | year |
|---|---|---|---|---|
| Cochice | 2 | Jason | 1 | 2012 |
| Pima | 2 | Molly | 2 | 2012 |
| Santa Cruz | 3 | Tina | 1 | 2013 |
| Maricopa | 3 | Jake | 2 | 2014 |
| Yuma | 3 | Amy | 3 | 2014 |

# 按 reports 列对数据帧的行降序排序
df.sort_values(by='reports', ascending=0)

|   | coverage | name | reports | year |
|---|---|---|---|---|
| Yuma | 3 | Amy | 3 | 2014 |
| Pima | 2 | Molly | 2 | 2012 |
| Maricopa | 3 | Jake | 2 | 2014 |
| Cochice | 2 | Jason | 1 | 2012 |
| Santa Cruz | 3 | Tina | 1 | 2013 |

# 先按 coverage 再按 reports 对数据帧的行升序排序
df.sort_values(by=['coverage', 'reports'])

|   | coverage | name | reports | year |
|---|---|---|---|---|
| Cochice | 2 | Jason | 1 | 2012 |
| Pima | 2 | Molly | 2 | 2012 |
| Santa Cruz | 3 | Tina | 1 | 2013 |
| Maricopa | 3 | Jake | 2 | 2014 |
| Yuma | 3 | Amy | 3 | 2014 |
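如果想对不同的列使用不同的排序方向,可以给 ascending 传入一个列表;下面是一个小示例,假设 df 如上:

# coverage 升序、reports 降序
df.sort_values(by=['coverage', 'reports'], ascending=[True, False])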
import pandas as pd
import numpy as np

raw_data = {'geo': ['40.0024, -105.4102', '40.0068, -105.266', '39.9318, -105.2813', np.nan]}
df = pd.DataFrame(raw_data, columns=['geo'])
df

|   | geo |
|---|---|
| 0 | 40.0024, -105.4102 |
| 1 | 40.0068, -105.266 |
| 2 | 39.9318, -105.2813 |
| 3 | NaN |

# 创建两个列表,用于存放循环的结果
lat = []
lon = []

# 对于 geo 列中的每一行
for row in df['geo']:
    # 尝试:
    try:
        # 用逗号分隔该行,并将逗号前的部分追加到 lat
        lat.append(row.split(',')[0])
        # 用逗号分隔该行,并将逗号后的部分追加到 lon
        lon.append(row.split(',')[1])
    # 但是如果出现错误(例如该行是缺失值)
    except:
        # 向 lat 添加缺失值
        lat.append(np.NaN)
        # 向 lon 添加缺失值
        lon.append(np.NaN)

# 从 lat 和 lon 创建两个新列
df['latitude'] = lat
df['longitude'] = lon
df

|   | geo | latitude | longitude |
|---|---|---|---|
| 0 | 40.0024, -105.4102 | 40.0024 | -105.4102 |
| 1 | 40.0068, -105.266 | 40.0068 | -105.266 |
| 2 | 39.9318, -105.2813 | 39.9318 | -105.2813 |
| 3 | NaN | NaN | NaN |
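不用显式循环也可以完成同样的拆分;下面是一个等价的小示例,利用字符串方法的 expand 参数(假设 df['geo'] 如上,缺失行会自动保持为 NaN):

# 按逗号把 geo 拆成两列
df[['latitude', 'longitude']] = df['geo'].str.split(',', expand=True)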
# 创建一些原始数据
raw_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# 定义产生 input+6 的生成器
def add_6(numbers):
    for x in numbers:
        output = x + 6
        yield output

# 定义产生 input-2 的生成器
def subtract_2(numbers):
    for x in numbers:
        output = x - 2
        yield output

# 定义产生 input*100 的生成器
def multiply_by_100(numbers):
    for x in numbers:
        output = x * 100
        yield output

# 流水线的第一步
step1 = add_6(raw_data)
# 流水线的第二步
step2 = subtract_2(step1)
# 流水线的第三步
pipeline = multiply_by_100(step2)

# 原始数据的第一个元素经过流水线的结果
next(pipeline)
# 500
# 原始数据的第二个元素经过流水线的结果
next(pipeline)
# 600
# 处理剩余的所有数据
for raw_data in pipeline:
    print(raw_data)
'''
700
800
900
1000
1100
1200
1300
1400
'''
# 导入模块
import pandas as pd
import numpy as np
import re as re

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.NAN, 'battler@milner.com', 'Ames1234@yahoo.com'],
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'])
df

# 电子邮件列中的哪些字符串包含 'gmail'
df['email'].str.contains('gmail')
'''
0 True
1 True
2 NaN
3 False
4 False
Name: email, dtype: object
'''
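由于第 2 行的邮箱是缺失值,contains 的结果里也出现了 NaN;如果希望把缺失值直接当作 False 处理,可以传入 na 参数。下面是一个小示例,假设 df 如上:

# 缺失值按 False 处理,返回纯布尔序列
df['email'].str.contains('gmail', na=False)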
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
df['email'].str.findall(pattern, flags=re.IGNORECASE)
'''
0 [(jas203, gmail, com)]
1 [(momomolly, gmail, com)]
2 NaN
3 [(battler, milner, com)]
4 [(Ames1234, yahoo, com)]
Name: email, dtype: object
'''
matches = df['email'].str.match(pattern, flags=re.IGNORECASE)
matches
'''
/Users/chrisralbon/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
if __name__ == '__main__':
0 (jas203, gmail, com)
1 (momomolly, gmail, com)
2 NaN
3 (battler, milner, com)
4 (Ames1234, yahoo, com)
Name: email, dtype: object
'''
matches.str[1]
'''
0 gmail
1 gmail
2 NaN
3 milner
4 yahoo
Name: email, dtype: object
'''
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)

data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

|   | name | reports | year |
|---|---|---|---|
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |

先用循环的写法。

# 创建变量
next_year = []
# 对于 df.year 的每一行
for row in df['year']:
    # 为该行的值加 1 并将其追加到 next_year
    next_year.append(row + 1)
# 创建 df.next_year
df['next_year'] = next_year
# 查看数据帧
df

|   | name | reports | year | next_year |
|---|---|---|---|---|
| Cochice | Jason | 4 | 2012 | 2013 |
| Pima | Molly | 24 | 2012 | 2013 |
| Santa Cruz | Tina | 31 | 2013 | 2014 |
| Maricopa | Jake | 2 | 2014 | 2015 |
| Yuma | Amy | 3 | 2014 | 2015 |

再用列表推导式的写法。

# 对于 df.year 中的每一行,用该行的值减去 1
df['previous_year'] = [row - 1 for row in df['year']]
df

|   | name | reports | year | next_year | previous_year |
|---|---|---|---|---|---|
| Cochice | Jason | 4 | 2012 | 2013 | 2011 |
| Pima | Molly | 24 | 2012 | 2013 | 2011 |
| Santa Cruz | Tina | 31 | 2013 | 2014 | 2012 |
| Maricopa | Jake | 2 | 2014 | 2015 | 2013 |
| Yuma | Amy | 3 | 2014 | 2015 | 2013 |
import pandas as pd
%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.DataFrame()
df['x'] = random.sample(range(1, 100), 25)
df['y'] = random.sample(range(1, 100), 25)
df.head()

|   | x | y |
|---|---|---|
| 0 | 18 | 25 |
| 1 | 42 | 67 |
| 2 | 52 | 77 |
| 3 | 4 | 34 |
| 4 | 14 | 69 |

# 散点图
sns.lmplot('x', 'y', data=df, fit_reg=False)
# <seaborn.axisgrid.FacetGrid at 0x114563b00>
# 密度图
sns.kdeplot(df.y)
# <matplotlib.axes._subplots.AxesSubplot at 0x113ea2ef0>
sns.kdeplot(df.y, df.x)
# <matplotlib.axes._subplots.AxesSubplot at 0x113d7fef0>
sns.distplot(df.x)
# <matplotlib.axes._subplots.AxesSubplot at 0x114294160>
# 直方图
plt.hist(df.x, alpha=.3)
sns.rugplot(df.x);
# 箱形图
sns.boxplot([df.y, df.x])
# <matplotlib.axes._subplots.AxesSubplot at 0x1142b8b38>
# 提琴图
sns.violinplot([df.y, df.x])
# <matplotlib.axes._subplots.AxesSubplot at 0x114444a58>
# 热力图
sns.heatmap([df.y, df.x], annot=True, fmt="d")
# <matplotlib.axes._subplots.AxesSubplot at 0x114530c88>
# 聚类图
sns.clustermap(df)
# <seaborn.matrix.ClusterGrid at 0x116f313c8>
# 导入模块
import pandas as pd

序列是一维数组(类似 R 的向量)。

# 创建 floodingReports 数量的序列
floodingReports = pd.Series([5, 6, 2, 9, 12])
floodingReports
'''
0 5
1 6
2 2
3 9
4 12
dtype: int64
'''
请注意,第一列数字(0 到 4)是索引。

# 将县名设置为 floodingReports 序列的索引
floodingReports = pd.Series([5, 6, 2, 9, 12], index=['Cochise County', 'Pima County', 'Santa Cruz County', 'Maricopa County', 'Yuma County'])
floodingReports
'''
Cochise County 5
Pima County 6
Santa Cruz County 2
Maricopa County 9
Yuma County 12
dtype: int64
'''
floodingReports['Cochise County']
# 5

floodingReports[floodingReports > 6]
'''
Maricopa County 9
Yuma County 12
dtype: int64
'''
从字典中创建 Pandas 序列。

注意:执行此操作时,字典的键将成为序列的索引。

# 创建字典
fireReports_dict = {'Cochise County': 12, 'Pima County': 342, 'Santa Cruz County': 13, 'Maricopa County': 42, 'Yuma County': 52}
# 将字典转换为 pd.Series,然后查看它
fireReports = pd.Series(fireReports_dict); fireReports
'''
Cochise County 12
Maricopa County 42
Pima County 342
Santa Cruz County 13
Yuma County 52
dtype: int64
'''
fireReports.index = ["Cochice", "Pima", "Santa Cruz", "Maricopa", "Yuma"]
fireReports
'''
Cochice 12
Pima 42
Santa Cruz 342
Maricopa 13
Yuma 52
dtype: int64
'''
数据帧就像 R 的数据帧。

# 从等长列表或 NumPy 数组的字典中创建数据帧
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df

|   | county | reports | year |
|---|---|---|---|
| 0 | Cochice | 4 | 2012 |
| 1 | Pima | 24 | 2012 |
| 2 | Santa Cruz | 31 | 2013 |
| 3 | Maricopa | 2 | 2014 |
| 4 | Yuma | 3 | 2014 |

# 使用 columns 属性设置列的顺序
dfColumnOrdered = pd.DataFrame(data, columns=['county', 'year', 'reports'])
dfColumnOrdered

|   | county | year | reports |
|---|---|---|---|
| 0 | Cochice | 2012 | 4 |
| 1 | Pima | 2012 | 24 |
| 2 | Santa Cruz | 2013 | 31 |
| 3 | Maricopa | 2014 | 2 |
| 4 | Yuma | 2014 | 3 |

# 添加一列
dfColumnOrdered['newsCoverage'] = pd.Series([42.3, 92.1, 12.2, 39.3, 30.2])
dfColumnOrdered

|   | county | year | reports | newsCoverage |
|---|---|---|---|---|
| 0 | Cochice | 2012 | 4 | 42.3 |
| 1 | Pima | 2012 | 24 | 92.1 |
| 2 | Santa Cruz | 2013 | 31 | 12.2 |
| 3 | Maricopa | 2014 | 2 | 39.3 |
| 4 | Yuma | 2014 | 3 | 30.2 |

# 删除一列
del dfColumnOrdered['newsCoverage']
dfColumnOrdered

|   | county | year | reports |
|---|---|---|---|
| 0 | Cochice | 2012 | 4 |
| 1 | Pima | 2012 | 24 |
| 2 | Santa Cruz | 2013 | 31 |
| 3 | Maricopa | 2014 | 2 |
| 4 | Yuma | 2014 | 3 |

# 转置数据帧
dfColumnOrdered.T

|   | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| county | Cochice | Pima | Santa Cruz | Maricopa | Yuma |
| year | 2012 | 2012 | 2013 | 2014 | 2014 |
| reports | 4 | 24 | 31 | 2 | 3 |
# 导入模块
from datetime import datetime
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as pyplot

data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
        'battle_deaths': [34, 25, 26, 15, 15, 14, 26, 25, 62, 41]}
df = pd.DataFrame(data, columns=['date', 'battle_deaths'])
print(df)
'''
date battle_deaths
0 2014-05-01 18:47:05.069722 34
1 2014-05-01 18:47:05.119994 25
2 2014-05-02 18:47:05.178768 26
3 2014-05-02 18:47:05.230071 15
4 2014-05-02 18:47:05.230071 15
5 2014-05-02 18:47:05.280592 14
6 2014-05-03 18:47:05.332662 26
7 2014-05-03 18:47:05.385109 25
8 2014-05-04 18:47:05.436523 62
9 2014-05-04 18:47:05.486877 41
'''
df['date'] = pd.to_datetime(df['date'])
df.index = df['date']
del df['date']
df

| date | battle_deaths |
|---|---|
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 查看 2014 年的所有观测
df['2014']

| date | battle_deaths |
|---|---|
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |

# 查看 2014 年 5 月的所有观测
df['2014-05']

| date | battle_deaths |
|---|---|
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
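这种直接用 df['2014'] 做部分日期字符串选择的写法,在较新版本的 pandas 中已不再推荐,改用 .loc 更稳妥;下面是等价的小示例:

# 基于标签的部分日期字符串选择
df.loc['2014']
df.loc['2014-05']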
# 查看 2014.5.3 及之后的所有观测
df[datetime(2014, 5, 3):]

| date | battle_deaths |
|---|---|
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |

# 查看 2014.5.3 到 2014.5.4 之间的所有观测
df['5/3/2014':'5/4/2014']

| date | battle_deaths |
|---|---|
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |

# 截断 2014.5.2 之后的观测
df.truncate(after='5/3/2014')

| date | battle_deaths |
|---|---|
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
# 2014 年 5 月的观测
df['5-2014']

| date | battle_deaths |
|---|---|
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 计算每个时间戳的观测数
df.groupby(level=0).count()

| date | battle_deaths |
|---|---|
| 2014-05-01 18:47:05.069722 | 1 |
| 2014-05-01 18:47:05.119994 | 1 |
| 2014-05-02 18:47:05.178768 | 1 |
| 2014-05-02 18:47:05.230071 | 2 |
| 2014-05-02 18:47:05.280592 | 1 |
| 2014-05-03 18:47:05.332662 | 1 |
| 2014-05-03 18:47:05.385109 | 1 |
| 2014-05-04 18:47:05.436523 | 1 |
| 2014-05-04 18:47:05.486877 | 1 |

# 每天的 battle_deaths 均值
df.resample('D').mean()

| date | battle_deaths |
|---|---|
| 2014-05-01 | 29.5 |
| 2014-05-02 | 17.5 |
| 2014-05-03 | 25.5 |
| 2014-05-04 | 51.5 |

# 每天的 battle_deaths 总数
df.resample('D').sum()

| date | battle_deaths |
|---|---|
| 2014-05-01 | 59 |
| 2014-05-02 | 70 |
| 2014-05-03 | 51 |
| 2014-05-04 | 103 |

# 绘制每天的总死亡人数
df.resample('D').sum().plot()
# <matplotlib.axes._subplots.AxesSubplot at 0x11187a940>