# Load the iris dataset directly from the seaborn-data GitHub repository.
import pandas as pd
import numpy as np

iris_df = pd.read_csv(
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv"
)
iris_df
-
- | sepal_length | -sepal_width | -petal_length | -petal_width | -species | -
---|---|---|---|---|---|
0 | -5.1 | -3.5 | -1.4 | -0.2 | -setosa | -
1 | -4.9 | -3.0 | -1.4 | -0.2 | -setosa | -
2 | -4.7 | -3.2 | -1.3 | -0.2 | -setosa | -
3 | -4.6 | -3.1 | -1.5 | -0.2 | -setosa | -
4 | -5.0 | -3.6 | -1.4 | -0.2 | -setosa | -
... | -... | -... | -... | -... | -... | -
145 | -6.7 | -3.0 | -5.2 | -2.3 | -virginica | -
146 | -6.3 | -2.5 | -5.0 | -1.9 | -virginica | -
147 | -6.5 | -3.0 | -5.2 | -2.0 | -virginica | -
148 | -6.2 | -3.4 | -5.4 | -2.3 | -virginica | -
149 | -5.9 | -3.0 | -5.1 | -1.8 | -virginica | -
150 rows × 5 columns
Concatenating and Merging#
Concat: pd.concat()#
Concatenate pandas objects along an axis.
Create two DataFrames and vertically stack them.
# Build two 3x4 frames of standard-normal draws, then stack them vertically.
df1, df2 = (pd.DataFrame(np.random.randn(3, 4)) for _ in range(2))

print(df1)
print('-' * 45)
print(df2)

# axis=0 stacks row-wise; each frame keeps its original row labels 0-2,
# so the result has duplicated index values.
df3 = pd.concat([df1, df2], axis=0)

print('-' * 45)
print(df3)
-
0 1 2 3
-0 -0.974224 -0.361428 -1.260157 -0.425635
-1 0.917031 -0.046927 1.048192 0.075938
-2 0.624287 -0.835596 -2.225162 -2.354256
----------------------------------------------
- 0 1 2 3
-0 -0.367434 -2.306643 -1.030095 -0.373502
-1 -0.179604 -1.704118 0.127096 0.098003
-2 2.444379 0.584522 -0.991921 -0.355007
----------------------------------------------
- 0 1 2 3
-0 -0.974224 -0.361428 -1.260157 -0.425635
-1 0.917031 -0.046927 1.048192 0.075938
-2 0.624287 -0.835596 -2.225162 -2.354256
-0 -0.367434 -2.306643 -1.030095 -0.373502
-1 -0.179604 -1.704118 0.127096 0.098003
-2 2.444379 0.584522 -0.991921 -0.355007
-
Concat columns
This assumes that the indexes represent IDs of specific things or events.
# Side-by-side concat: keys=['foo', 'bar'] adds an outer column level
# labelling which input frame each column came from.
df4 = pd.concat([df1, df2], axis=1, keys=['foo', 'bar'])

df4
-
- | foo | -bar | -||||||
---|---|---|---|---|---|---|---|---|
- | 0 | -1 | -2 | -3 | -0 | -1 | -2 | -3 | -
0 | --0.974224 | --0.361428 | --1.260157 | --0.425635 | --0.367434 | --2.306643 | --1.030095 | --0.373502 | -
1 | -0.917031 | --0.046927 | -1.048192 | -0.075938 | --0.179604 | --1.704118 | -0.127096 | -0.098003 | -
2 | -0.624287 | --0.835596 | --2.225162 | --2.354256 | -2.444379 | -0.584522 | --0.991921 | --0.355007 | -
df4.foo
-
- | 0 | -1 | -2 | -3 | -
---|---|---|---|---|
0 | --0.974224 | --0.361428 | --1.260157 | --0.425635 | -
1 | -0.917031 | --0.046927 | -1.048192 | -0.075938 | -
2 | -0.624287 | --0.835596 | --2.225162 | --2.354256 | -
df4.bar
-
- | 0 | -1 | -2 | -3 | -
---|---|---|---|---|
0 | --0.367434 | --2.306643 | --1.030095 | --0.373502 | -
1 | --0.179604 | --1.704118 | -0.127096 | -0.098003 | -
2 | -2.444379 | -0.584522 | --0.991921 | --0.355007 | -
Merge: pd.merge()#
SQL-style joining of tables (DataFrames).
Important parameters include:

how: type of merge {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
on: column names to join on

Very useful!
Create two tables, left and right. Then right join them on key.
A right join means include all records from the table on the right.
The key is used for matching up the records.
# Two small lookup tables sharing a "key" column.
left = pd.DataFrame({"key": ["jamie", "bill"], "lval": [15, 22]})
right = pd.DataFrame({"key": ["jamie", "bill", "asher"], "rval": [4, 5, 8]})

# how="right" keeps every row of `right`; keys of `left` with no match
# contribute NaN (which is why lval comes out as float).
joined = left.merge(right, on="key", how="right")

for label, frame in (('---left', left), ('\n---right', right), ('\n---joined', joined)):
    print(label)
    print(frame)
-
---left
- key lval
-0 jamie 15
-1 bill 22
-
----right
- key rval
-0 jamie 4
-1 bill 5
-2 asher 8
-
----joined
- key lval rval
-0 jamie 15.0 4
-1 bill 22.0 5
-2 asher NaN 8
-
Summary#

Use join if you have shared indexes.
Use merge if you do not have shared indexes.
Use concat to combine based on shared indexes or columns.
Data Aggregation#
Involves one or more of:

splitting the data into groups
applying a function to each group
combining the results

Aggregation by .groupby()#
Compute the mean of each column, grouped (separately) by species.
-iris_df.groupby("species").mean()
-
- | sepal_length | -sepal_width | -petal_length | -petal_width | -
---|---|---|---|---|
species | -- | - | - | - |
setosa | -5.006 | -3.428 | -1.462 | -0.246 | -
versicolor | -5.936 | -2.770 | -4.260 | -1.326 | -
virginica | -6.588 | -2.974 | -5.552 | -2.026 | -
pd.pivot_table()#
Apply a function aggfunc to selected values grouped by columns.
Compute the mean sepal length for each species:
-pd.pivot_table(iris_df, values="sepal_length", columns=["species"], aggfunc = np.mean)
-
species | -setosa | -versicolor | -virginica | -
---|---|---|---|
sepal_length | -5.006 | -5.936 | -6.588 | -
Reshaping Data#
.reshape()#
Changes the object's shape.
We illustrate creating a pandas Series, extracting an array of length 6, and reshaping it to a 3×2 array.
# A small Fibonacci-valued Series to demonstrate reshaping.
ser = pd.Series([1, 1, 2, 3, 5, 8])

# .values hands back the underlying 1-D numpy array (length 6).
vals = ser.values
print('orig data:', vals)
print('orig type:', type(vals))
print('orig shape:', vals.shape)

# Reshape to 3 rows x 2 columns; the element count (6) must stay the same.
reshaped_vals = vals.reshape(3, 2)
print('\n reshaped vals:')
print(reshaped_vals)
print('\n new type:', type(reshaped_vals))
print('new shape:', reshaped_vals.shape)
-
orig data: [1 1 2 3 5 8]
-orig type: <class 'numpy.ndarray'>
-orig shape: (6,)
-
- reshaped vals:
-[[1 1]
- [2 3]
- [5 8]]
-
- new type: <class 'numpy.ndarray'>
-new shape: (3, 2)
-