Pandas¶

In [1]:
import pandas as pd

Background: dictionary¶

In [2]:
age = {'Anna': 6, 'Teresa': 9}
age
Out[2]:
{'Anna': 6, 'Teresa': 9}
In [3]:
age['Anna']
Out[3]:
6

Create, write, and read a DataFrame¶

In [4]:
df = pd.DataFrame({'item': ('milk', 'apples', 'bread'),
                   'price': (3.50, 0.40, 3.10),
                   'n': (2, 5, 1)})
df
Out[4]:
item price n
0 milk 3.5 2
1 apples 0.4 5
2 bread 3.1 1
In [5]:
df.to_csv(path_or_buf='groceries.csv', index=False)
groceries = pd.read_csv(filepath_or_buffer='groceries.csv')
groceries
Out[5]:
item price n
0 milk 3.5 2
1 apples 0.4 5
2 bread 3.1 1
In [6]:
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/DJIA.csv', index_col='Symbol')
df
Out[6]:
Exchange Industry State Price MarketCap AvgVol
Symbol
MMM NYSE Conglomerate Minnesota 157.0 8.985700e+10 2690000.0
AXP NYSE Financial New York 198.0 1.503370e+11 4330000.0
AMGN NASDAQ Biopharmaceutical California 224.0 1.249590e+11 3110000.0
AAPL NASDAQ IT California 173.0 2.820000e+12 103560000.0
BA NYSE Aerospace Illinois 218.0 1.269370e+11 10350000.0
CAT NYSE Construction Illinois 203.0 1.100330e+11 3450000.0
CVX NYSE Petroleum California 134.0 2.588120e+11 12440000.0
CSCO NASDAQ IT California 54.0 2.288900e+11 23770000.0
KO NYSE Soft Drink Georgia 61.0 2.634360e+11 19550000.0
DIS NYSE Broadcasting California 155.0 2.816880e+11 13540000.0
DOW NYSE Chemical Michigan 62.0 4.551300e+10 5700000.0
GS NYSE Financial New York 364.0 1.214020e+11 3150000.0
HD NYSE Home Improvement Georgia 353.0 3.688880e+11 4310000.0
HON NASDAQ Conglomerate North Carolina 190.0 1.299830e+11 3450000.0
IBM NYSE IT New York 130.0 1.164680e+11 5920000.0
INTC NASDAQ Semiconductor California 48.0 1.972480e+11 36540000.0
JNJ NYSE Drug New Jersey 167.0 4.404600e+11 7940000.0
JPM NYSE Financial New York 155.0 4.555110e+11 14440000.0
MCD NYSE Food Illinois 254.0 1.896580e+11 2810000.0
MRK NYSE Drug New Jersey 78.0 1.965430e+11 14070000.0
MSFT NASDAQ IT Washington 300.0 2.253000e+12 35580000.0
NKE NYSE Apparel Oregon 146.0 2.302060e+11 5890000.0
PG NYSE Consumer Ohio 157.0 3.759090e+11 8610000.0
CRM NYSE IT California 214.0 2.110360e+11 7120000.0
TRV NYSE Insurance New York 172.0 4.139000e+10 1580000.0
UNH NYSE Health Minnesota 478.0 4.504210e+11 3370000.0
VZ NYSE Telecom New York 53.0 2.239960e+11 23190000.0
V NYSE Financial California 228.0 4.933530e+11 10880000.0
WBA NASDAQ Retailing Illinois 48.0 4.169600e+10 6620000.0
WMT NYSE Retailing Arkansas 134.0 3.727260e+11 9630000.0

Inspect¶

In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, MMM to WMT
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Exchange   30 non-null     object 
 1   Industry   30 non-null     object 
 2   State      30 non-null     object 
 3   Price      30 non-null     float64
 4   MarketCap  30 non-null     float64
 5   AvgVol     30 non-null     float64
dtypes: float64(3), object(3)
memory usage: 1.6+ KB
In [8]:
df.shape
Out[8]:
(30, 6)
In [9]:
df.head()
Out[9]:
Exchange Industry State Price MarketCap AvgVol
Symbol
MMM NYSE Conglomerate Minnesota 157.0 8.985700e+10 2690000.0
AXP NYSE Financial New York 198.0 1.503370e+11 4330000.0
AMGN NASDAQ Biopharmaceutical California 224.0 1.249590e+11 3110000.0
AAPL NASDAQ IT California 173.0 2.820000e+12 103560000.0
BA NYSE Aerospace Illinois 218.0 1.269370e+11 10350000.0
In [10]:
df.tail()
Out[10]:
Exchange Industry State Price MarketCap AvgVol
Symbol
UNH NYSE Health Minnesota 478.0 4.504210e+11 3370000.0
VZ NYSE Telecom New York 53.0 2.239960e+11 23190000.0
V NYSE Financial California 228.0 4.933530e+11 10880000.0
WBA NASDAQ Retailing Illinois 48.0 4.169600e+10 6620000.0
WMT NYSE Retailing Arkansas 134.0 3.727260e+11 9630000.0
In [11]:
df.index
Out[11]:
Index(['MMM', 'AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO', 'KO', 'DIS',
       'DOW', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'MCD', 'MRK',
       'MSFT', 'NKE', 'PG', 'CRM', 'TRV', 'UNH', 'VZ', 'V', 'WBA', 'WMT'],
      dtype='object', name='Symbol')
In [12]:
df.columns
Out[12]:
Index(['Exchange', 'Industry', 'State', 'Price', 'MarketCap', 'AvgVol'], dtype='object')
In [13]:
df.describe()
Out[13]:
Price MarketCap AvgVol
count 30.000000 3.000000e+01 3.000000e+01
mean 176.933333 3.803452e+11 1.358633e+07
std 100.994344 6.047850e+11 1.927975e+07
min 48.000000 4.139000e+10 1.580000e+06
25% 131.000000 1.254535e+11 3.665000e+06
50% 162.000000 2.175160e+11 7.530000e+06
75% 217.000000 3.717665e+11 1.393750e+07
max 478.000000 2.820000e+12 1.035600e+08
In [14]:
df.describe(include=[object])
Out[14]:
Exchange Industry State
count 30 30 30
unique 2 20 12
top NYSE IT California
freq 23 5 8
In [15]:
groceries.describe()
Out[15]:
price n
count 3.000000 3.000000
mean 2.333333 2.666667
std 1.686219 2.081666
min 0.400000 1.000000
25% 1.750000 1.500000
50% 3.100000 2.000000
75% 3.300000 3.500000
max 3.500000 5.000000
In [16]:
groceries.describe(include=[object])
Out[16]:
item
count 3
unique 3
top milk
freq 1
In [17]:
pd.crosstab(df['State'], df['Exchange'])
Out[17]:
Exchange NASDAQ NYSE
State
Arkansas 0 1
California 4 4
Georgia 0 2
Illinois 1 3
Michigan 0 1
Minnesota 0 2
New Jersey 0 2
New York 0 6
North Carolina 1 0
Ohio 0 1
Oregon 0 1
Washington 1 0
In [18]:
df.max()
Out[18]:
Exchange                NYSE
Industry             Telecom
State             Washington
Price                  478.0
MarketCap    2820000000000.0
AvgVol           103560000.0
dtype: object
In [19]:
df['Price'].max() # peek ahead to "Select a subset" first bullet
Out[19]:
478.0
In [20]:
df['MarketCap'].idxmax()
Out[20]:
'AAPL'
In [21]:
df['Price'].mean()
Out[21]:
176.93333333333334
In [22]:
df.Exchange.value_counts()
Out[22]:
Exchange
NYSE      23
NASDAQ     7
Name: count, dtype: int64
In [23]:
df.groupby('Exchange').groups.keys()
Out[23]:
dict_keys(['NASDAQ', 'NYSE'])
In [24]:
df.groupby('Exchange').mean(numeric_only=True)
Out[24]:
Price MarketCap AvgVol
Exchange
NASDAQ 148.142857 8.279680e+11 3.037571e+07
NYSE 185.695652 2.441122e+11 8.476522e+06

Select a subset¶

In [25]:
df.Price
Out[25]:
Symbol
MMM     157.0
AXP     198.0
AMGN    224.0
AAPL    173.0
BA      218.0
CAT     203.0
CVX     134.0
CSCO     54.0
KO       61.0
DIS     155.0
DOW      62.0
GS      364.0
HD      353.0
HON     190.0
IBM     130.0
INTC     48.0
JNJ     167.0
JPM     155.0
MCD     254.0
MRK      78.0
MSFT    300.0
NKE     146.0
PG      157.0
CRM     214.0
TRV     172.0
UNH     478.0
VZ       53.0
V       228.0
WBA      48.0
WMT     134.0
Name: Price, dtype: float64
In [26]:
df[['Price', 'MarketCap']]
Out[26]:
Price MarketCap
Symbol
MMM 157.0 8.985700e+10
AXP 198.0 1.503370e+11
AMGN 224.0 1.249590e+11
AAPL 173.0 2.820000e+12
BA 218.0 1.269370e+11
CAT 203.0 1.100330e+11
CVX 134.0 2.588120e+11
CSCO 54.0 2.288900e+11
KO 61.0 2.634360e+11
DIS 155.0 2.816880e+11
DOW 62.0 4.551300e+10
GS 364.0 1.214020e+11
HD 353.0 3.688880e+11
HON 190.0 1.299830e+11
IBM 130.0 1.164680e+11
INTC 48.0 1.972480e+11
JNJ 167.0 4.404600e+11
JPM 155.0 4.555110e+11
MCD 254.0 1.896580e+11
MRK 78.0 1.965430e+11
MSFT 300.0 2.253000e+12
NKE 146.0 2.302060e+11
PG 157.0 3.759090e+11
CRM 214.0 2.110360e+11
TRV 172.0 4.139000e+10
UNH 478.0 4.504210e+11
VZ 53.0 2.239960e+11
V 228.0 4.933530e+11
WBA 48.0 4.169600e+10
WMT 134.0 3.727260e+11
In [27]:
df.loc['AAPL']
Out[27]:
Exchange              NASDAQ
Industry                  IT
State             California
Price                  173.0
MarketCap    2820000000000.0
AvgVol           103560000.0
Name: AAPL, dtype: object
In [28]:
df[0:3]
Out[28]:
Exchange Industry State Price MarketCap AvgVol
Symbol
MMM NYSE Conglomerate Minnesota 157.0 8.985700e+10 2690000.0
AXP NYSE Financial New York 198.0 1.503370e+11 4330000.0
AMGN NASDAQ Biopharmaceutical California 224.0 1.249590e+11 3110000.0
In [29]:
df.iloc[0:3, [0, 2, 3]]
Out[29]:
Exchange State Price
Symbol
MMM NYSE Minnesota 157.0
AXP NYSE New York 198.0
AMGN NASDAQ California 224.0
In [30]:
df.loc['AAPL', 'State']
Out[30]:
'California'
In [31]:
df.loc[['AAPL', 'MMM'], ['State', 'Industry']]
Out[31]:
State Industry
Symbol
AAPL California IT
MMM Minnesota Conglomerate
In [32]:
df[df.Exchange == 'NASDAQ']
Out[32]:
Exchange Industry State Price MarketCap AvgVol
Symbol
AMGN NASDAQ Biopharmaceutical California 224.0 1.249590e+11 3110000.0
AAPL NASDAQ IT California 173.0 2.820000e+12 103560000.0
CSCO NASDAQ IT California 54.0 2.288900e+11 23770000.0
HON NASDAQ Conglomerate North Carolina 190.0 1.299830e+11 3450000.0
INTC NASDAQ Semiconductor California 48.0 1.972480e+11 36540000.0
MSFT NASDAQ IT Washington 300.0 2.253000e+12 35580000.0
WBA NASDAQ Retailing Illinois 48.0 4.169600e+10 6620000.0

Sort¶

In [33]:
df.sort_index(axis=0, ascending=True) # also try 1, False
Out[33]:
Exchange Industry State Price MarketCap AvgVol
Symbol
AAPL NASDAQ IT California 173.0 2.820000e+12 103560000.0
AMGN NASDAQ Biopharmaceutical California 224.0 1.249590e+11 3110000.0
AXP NYSE Financial New York 198.0 1.503370e+11 4330000.0
BA NYSE Aerospace Illinois 218.0 1.269370e+11 10350000.0
CAT NYSE Construction Illinois 203.0 1.100330e+11 3450000.0
CRM NYSE IT California 214.0 2.110360e+11 7120000.0
CSCO NASDAQ IT California 54.0 2.288900e+11 23770000.0
CVX NYSE Petroleum California 134.0 2.588120e+11 12440000.0
DIS NYSE Broadcasting California 155.0 2.816880e+11 13540000.0
DOW NYSE Chemical Michigan 62.0 4.551300e+10 5700000.0
GS NYSE Financial New York 364.0 1.214020e+11 3150000.0
HD NYSE Home Improvement Georgia 353.0 3.688880e+11 4310000.0
HON NASDAQ Conglomerate North Carolina 190.0 1.299830e+11 3450000.0
IBM NYSE IT New York 130.0 1.164680e+11 5920000.0
INTC NASDAQ Semiconductor California 48.0 1.972480e+11 36540000.0
JNJ NYSE Drug New Jersey 167.0 4.404600e+11 7940000.0
JPM NYSE Financial New York 155.0 4.555110e+11 14440000.0
KO NYSE Soft Drink Georgia 61.0 2.634360e+11 19550000.0
MCD NYSE Food Illinois 254.0 1.896580e+11 2810000.0
MMM NYSE Conglomerate Minnesota 157.0 8.985700e+10 2690000.0
MRK NYSE Drug New Jersey 78.0 1.965430e+11 14070000.0
MSFT NASDAQ IT Washington 300.0 2.253000e+12 35580000.0
NKE NYSE Apparel Oregon 146.0 2.302060e+11 5890000.0
PG NYSE Consumer Ohio 157.0 3.759090e+11 8610000.0
TRV NYSE Insurance New York 172.0 4.139000e+10 1580000.0
UNH NYSE Health Minnesota 478.0 4.504210e+11 3370000.0
V NYSE Financial California 228.0 4.933530e+11 10880000.0
VZ NYSE Telecom New York 53.0 2.239960e+11 23190000.0
WBA NASDAQ Retailing Illinois 48.0 4.169600e+10 6620000.0
WMT NYSE Retailing Arkansas 134.0 3.727260e+11 9630000.0
In [34]:
df.sort_values(by=['Industry', 'State'], axis=0, ascending=False) # try [False, True]
Out[34]:
Exchange Industry State Price MarketCap AvgVol
Symbol
VZ NYSE Telecom New York 53.0 2.239960e+11 23190000.0
KO NYSE Soft Drink Georgia 61.0 2.634360e+11 19550000.0
INTC NASDAQ Semiconductor California 48.0 1.972480e+11 36540000.0
WBA NASDAQ Retailing Illinois 48.0 4.169600e+10 6620000.0
WMT NYSE Retailing Arkansas 134.0 3.727260e+11 9630000.0
CVX NYSE Petroleum California 134.0 2.588120e+11 12440000.0
TRV NYSE Insurance New York 172.0 4.139000e+10 1580000.0
MSFT NASDAQ IT Washington 300.0 2.253000e+12 35580000.0
IBM NYSE IT New York 130.0 1.164680e+11 5920000.0
AAPL NASDAQ IT California 173.0 2.820000e+12 103560000.0
CSCO NASDAQ IT California 54.0 2.288900e+11 23770000.0
CRM NYSE IT California 214.0 2.110360e+11 7120000.0
HD NYSE Home Improvement Georgia 353.0 3.688880e+11 4310000.0
UNH NYSE Health Minnesota 478.0 4.504210e+11 3370000.0
MCD NYSE Food Illinois 254.0 1.896580e+11 2810000.0
AXP NYSE Financial New York 198.0 1.503370e+11 4330000.0
GS NYSE Financial New York 364.0 1.214020e+11 3150000.0
JPM NYSE Financial New York 155.0 4.555110e+11 14440000.0
V NYSE Financial California 228.0 4.933530e+11 10880000.0
JNJ NYSE Drug New Jersey 167.0 4.404600e+11 7940000.0
MRK NYSE Drug New Jersey 78.0 1.965430e+11 14070000.0
PG NYSE Consumer Ohio 157.0 3.759090e+11 8610000.0
CAT NYSE Construction Illinois 203.0 1.100330e+11 3450000.0
HON NASDAQ Conglomerate North Carolina 190.0 1.299830e+11 3450000.0
MMM NYSE Conglomerate Minnesota 157.0 8.985700e+10 2690000.0
DOW NYSE Chemical Michigan 62.0 4.551300e+10 5700000.0
DIS NYSE Broadcasting California 155.0 2.816880e+11 13540000.0
AMGN NASDAQ Biopharmaceutical California 224.0 1.249590e+11 3110000.0
NKE NYSE Apparel Oregon 146.0 2.302060e+11 5890000.0
BA NYSE Aerospace Illinois 218.0 1.269370e+11 10350000.0

Add to a DataFrame¶

In [35]:
groceries['weight'] = (8, 0.3, 1.0)
groceries
Out[35]:
item price n weight
0 milk 3.5 2 8.0
1 apples 0.4 5 0.3
2 bread 3.1 1 1.0
In [36]:
new_row = pd.DataFrame({'item':'popcorn', 'price':3, 'n':1, 'weight':2}, index=[0])
new_row
Out[36]:
item price n weight
0 popcorn 3 1 2
In [37]:
pd.concat([groceries, new_row], ignore_index=True) # ignore to reset second 0
Out[37]:
item price n weight
0 milk 3.5 2 8.0
1 apples 0.4 5 0.3
2 bread 3.1 1 1.0
3 popcorn 3.0 1 2.0

Examples¶

In [38]:
# average Price of NASDAQ stocks
df[df.Exchange == 'NASDAQ'].Price.mean() # try NYSE
Out[38]:
148.14285714285714
In [39]:
# highest Price among the NASDAQ stocks with AvgVol under 10 million
df[(df.Exchange == 'NASDAQ') & (df.AvgVol < 1e7)].Price.max()
Out[39]:
224.0
In [40]:
# average MarketCap of 3 highest-Price stocks
df.sort_values(by='Price', ascending=False)[0:3].MarketCap.mean()
Out[40]:
313570333333.3333