Pandas and sql - II

In [1]:
import numpy as np
import pandas as pd

# read the data from the downloaded CSV file.
tips = pd.read_csv('tips.csv')
tips.head(4)
Out[1]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
In [2]:
from sqlalchemy import create_engine

#Create an in-memory SQLite database.
engine = create_engine('sqlite://', echo=False)
tips.to_sql('tips', con=engine)

SELECT

In [3]:
engine.execute("SELECT count(*) FROM tips").fetchall()
Out[3]:
[(244,)]
In [4]:
tips.shape
Out[4]:
(244, 7)
In [5]:
engine.execute("SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;").fetchall()
Out[5]:
[(16.99, 1.01, 'No', 'Dinner'),
 (10.34, 1.66, 'No', 'Dinner'),
 (21.01, 3.5, 'No', 'Dinner'),
 (23.68, 3.31, 'No', 'Dinner'),
 (24.59, 3.61, 'No', 'Dinner')]
In [6]:
tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
Out[6]:
total_bill tip smoker time
0 16.99 1.01 No Dinner
1 10.34 1.66 No Dinner
2 21.01 3.50 No Dinner
3 23.68 3.31 No Dinner
4 24.59 3.61 No Dinner
In [7]:
engine.execute("SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;").fetchall()
Out[7]:
[(0, 16.99, 1.01, 'Female', 'No', 'Sun', 'Dinner', 2),
 (1, 10.34, 1.66, 'Male', 'No', 'Sun', 'Dinner', 3),
 (2, 21.01, 3.5, 'Male', 'No', 'Sun', 'Dinner', 3),
 (3, 23.68, 3.31, 'Male', 'No', 'Sun', 'Dinner', 2),
 (4, 24.59, 3.61, 'Female', 'No', 'Sun', 'Dinner', 4)]
In [8]:
tips[tips['time'] == 'Dinner'].head(5)
Out[8]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [9]:
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()
Out[9]:
True     176
False     68
Name: time, dtype: int64
In [10]:
tips[is_dinner].head(5)
Out[10]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [11]:
engine.execute("SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;").fetchall()
Out[11]:
[(23, 39.42, 7.58, 'Male', 'No', 'Sat', 'Dinner', 4),
 (44, 30.4, 5.6, 'Male', 'No', 'Sun', 'Dinner', 4),
 (47, 32.4, 6.0, 'Male', 'No', 'Sun', 'Dinner', 4),
 (52, 34.81, 5.2, 'Female', 'No', 'Sun', 'Dinner', 4),
 (59, 48.27, 6.73, 'Male', 'No', 'Sat', 'Dinner', 4),
 (116, 29.93, 5.07, 'Male', 'No', 'Sun', 'Dinner', 4),
 (155, 29.85, 5.14, 'Female', 'No', 'Sun', 'Dinner', 5),
 (170, 50.81, 10.0, 'Male', 'Yes', 'Sat', 'Dinner', 3),
 (172, 7.25, 5.15, 'Male', 'Yes', 'Sun', 'Dinner', 2),
 (181, 23.33, 5.65, 'Male', 'Yes', 'Sun', 'Dinner', 2),
 (183, 23.17, 6.5, 'Male', 'Yes', 'Sun', 'Dinner', 4),
 (211, 25.89, 5.16, 'Male', 'Yes', 'Sat', 'Dinner', 4),
 (212, 48.33, 9.0, 'Male', 'No', 'Sat', 'Dinner', 4),
 (214, 28.17, 6.5, 'Female', 'Yes', 'Sat', 'Dinner', 3),
 (239, 29.03, 5.92, 'Male', 'No', 'Sat', 'Dinner', 3)]
In [12]:
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
Out[12]:
total_bill tip sex smoker day time size
23 39.42 7.58 Male No Sat Dinner 4
44 30.40 5.60 Male No Sun Dinner 4
47 32.40 6.00 Male No Sun Dinner 4
52 34.81 5.20 Female No Sun Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
116 29.93 5.07 Male No Sun Dinner 4
155 29.85 5.14 Female No Sun Dinner 5
170 50.81 10.00 Male Yes Sat Dinner 3
172 7.25 5.15 Male Yes Sun Dinner 2
181 23.33 5.65 Male Yes Sun Dinner 2
183 23.17 6.50 Male Yes Sun Dinner 4
211 25.89 5.16 Male Yes Sat Dinner 4
212 48.33 9.00 Male No Sat Dinner 4
214 28.17 6.50 Female Yes Sat Dinner 3
239 29.03 5.92 Male No Sat Dinner 3
In [13]:
engine.execute("SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;").fetchall()
Out[13]:
[(59, 48.27, 6.73, 'Male', 'No', 'Sat', 'Dinner', 4),
 (125, 29.8, 4.2, 'Female', 'No', 'Thur', 'Lunch', 6),
 (141, 34.3, 6.7, 'Male', 'No', 'Thur', 'Lunch', 6),
 (142, 41.19, 5.0, 'Male', 'No', 'Thur', 'Lunch', 5),
 (143, 27.05, 5.0, 'Female', 'No', 'Thur', 'Lunch', 6),
 (155, 29.85, 5.14, 'Female', 'No', 'Sun', 'Dinner', 5),
 (156, 48.17, 5.0, 'Male', 'No', 'Sun', 'Dinner', 6),
 (170, 50.81, 10.0, 'Male', 'Yes', 'Sat', 'Dinner', 3),
 (182, 45.35, 3.5, 'Male', 'Yes', 'Sun', 'Dinner', 3),
 (185, 20.69, 5.0, 'Male', 'No', 'Sun', 'Dinner', 5),
 (187, 30.46, 2.0, 'Male', 'Yes', 'Sun', 'Dinner', 5),
 (212, 48.33, 9.0, 'Male', 'No', 'Sat', 'Dinner', 4),
 (216, 28.15, 3.0, 'Male', 'Yes', 'Sat', 'Dinner', 5)]
In [14]:
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
Out[14]:
total_bill tip sex smoker day time size
59 48.27 6.73 Male No Sat Dinner 4
125 29.80 4.20 Female No Thur Lunch 6
141 34.30 6.70 Male No Thur Lunch 6
142 41.19 5.00 Male No Thur Lunch 5
143 27.05 5.00 Female No Thur Lunch 6
155 29.85 5.14 Female No Sun Dinner 5
156 48.17 5.00 Male No Sun Dinner 6
170 50.81 10.00 Male Yes Sat Dinner 3
182 45.35 3.50 Male Yes Sun Dinner 3
185 20.69 5.00 Male No Sun Dinner 5
187 30.46 2.00 Male Yes Sun Dinner 5
212 48.33 9.00 Male No Sat Dinner 4
216 28.15 3.00 Male Yes Sat Dinner 5
In [15]:
frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'],
                      'col2': ['F', np.NaN, 'G', 'H', 'I']})
frame.to_sql('frame', con=engine)
frame
Out[15]:
col1 col2
0 A F
1 B NaN
2 NaN G
3 C H
4 D I
In [16]:
engine.execute("SELECT * FROM frame WHERE col2 IS NULL;").fetchall()
Out[16]:
[(1, 'B', None)]
In [17]:
frame[frame['col2'].isna()]
Out[17]:
col1 col2
1 B NaN
In [18]:
engine.execute("SELECT * FROM frame WHERE col1 IS NOT NULL;").fetchall()
Out[18]:
[(0, 'A', 'F'), (1, 'B', None), (3, 'C', 'H'), (4, 'D', 'I')]
In [19]:
frame[frame['col1'].notna()]
Out[19]:
col1 col2
0 A F
1 B NaN
3 C H
4 D I
In [20]:
engine.execute("SELECT sex, count(*) FROM tips GROUP BY sex;").fetchall()
Out[20]:
[('Female', 87), ('Male', 157)]
In [21]:
tips.groupby('sex').size()
Out[21]:
sex
Female     87
Male      157
dtype: int64
In [22]:
#count() returns the number of not null records / column
tips.groupby('sex').count()
Out[22]:
total_bill tip smoker day time size
sex
Female 87 87 87 87 87 87
Male 157 157 157 157 157 157
In [23]:
tips.groupby('sex')['total_bill'].count()
Out[23]:
sex
Female     87
Male      157
Name: total_bill, dtype: int64
In [24]:
engine.execute("SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;").fetchall()
Out[24]:
[('Fri', 2.734736842105263, 19),
 ('Sat', 2.993103448275862, 87),
 ('Sun', 3.255131578947369, 76),
 ('Thur', 2.771451612903226, 62)]
In [25]:
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
Out[25]:
tip day
day
Fri 2.734737 19
Sat 2.993103 87
Sun 3.255132 76
Thur 2.771452 62
In [26]:
engine.execute("SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;").fetchall()
Out[26]:
[('No', 'Fri', 4, 2.8125),
 ('No', 'Sat', 45, 3.102888888888889),
 ('No', 'Sun', 57, 3.1678947368421055),
 ('No', 'Thur', 45, 2.673777777777778),
 ('Yes', 'Fri', 15, 2.714),
 ('Yes', 'Sat', 42, 2.8754761904761903),
 ('Yes', 'Sun', 19, 3.5168421052631573),
 ('Yes', 'Thur', 17, 3.0299999999999994)]
In [27]:
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})
Out[27]:
tip
size mean
smoker day
No Fri 4.0 2.812500
Sat 45.0 3.102889
Sun 57.0 3.167895
Thur 45.0 2.673778
Yes Fri 15.0 2.714000
Sat 42.0 2.875476
Sun 19.0 3.516842
Thur 17.0 3.030000
In [28]:
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
                    'value': np.random.randn(4)})
 
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
                    'value': np.random.randn(4)})

df1.to_sql('df1', con=engine)
df2.to_sql('df2', con=engine)
In [29]:
engine.execute("SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;").fetchall()
Out[29]:
[(1, 'B', 0.20341211387204833, 0, 'B', -2.339283854295101),
 (3, 'D', 1.3084518213776402, 1, 'D', 0.9050996536461622),
 (3, 'D', 1.3084518213776402, 2, 'D', -0.3964236312984381)]
In [32]:
pd.merge(df1, df2, on='key')
Out[32]:
key value_x value_y
0 B 0.203412 -2.339284
1 D 1.308452 0.905100
2 D 1.308452 -0.396424
In [33]:
indexed_df2 = df2.set_index('key')
pd.merge(df1, indexed_df2, left_on='key', right_index=True)
Out[33]:
key value_x value_y
1 B 0.203412 -2.339284
3 D 1.308452 0.905100
3 D 1.308452 -0.396424
In [34]:
#show all records from df1
engine.execute("SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;").fetchall()
Out[34]:
[(0, 'A', -0.05722100507450193, None, None, None),
 (1, 'B', 0.20341211387204833, 0, 'B', -2.339283854295101),
 (2, 'C', -0.20711703665853892, None, None, None),
 (3, 'D', 1.3084518213776402, 1, 'D', 0.9050996536461622),
 (3, 'D', 1.3084518213776402, 2, 'D', -0.3964236312984381)]
In [35]:
#show all records from df1
pd.merge(df1, df2, on='key', how='left')
Out[35]:
key value_x value_y
0 A -0.057221 NaN
1 B 0.203412 -2.339284
2 C -0.207117 NaN
3 D 1.308452 0.905100
4 D 1.308452 -0.396424
In [37]:
#show all records from df2
#engine.execute("SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key;").fetchall()
#not supported by sqlite
In [38]:
# show all records from df2
pd.merge(df1, df2, on='key', how='right')
Out[38]:
key value_x value_y
0 B 0.203412 -2.339284
1 D 1.308452 0.905100
2 D 1.308452 -0.396424
3 E NaN -1.927822
In [40]:
#show all records from both tables
#engine.execute("SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key;").fetchall()
#not supported by sqlite
In [41]:
# show all records from both frames
pd.merge(df1, df2, on='key', how='outer')
Out[41]:
key value_x value_y
0 A -0.057221 NaN
1 B 0.203412 -2.339284
2 C -0.207117 NaN
3 D 1.308452 0.905100
4 D 1.308452 -0.396424
5 E NaN -1.927822
In [45]:
df11 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'],
                    'rank': range(1, 4)})
df12 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'],
                    'rank': [1, 4, 5]})
df11.to_sql('df11', con=engine)
df12.to_sql('df12', con=engine)
In [46]:
engine.execute("SELECT city, rank FROM df11 UNION ALL SELECT city, rank FROM df12;").fetchall()
Out[46]:
[('Chicago', 1),
 ('San Francisco', 2),
 ('New York City', 3),
 ('Chicago', 1),
 ('Boston', 4),
 ('Los Angeles', 5)]
In [47]:
pd.concat([df11, df12])
Out[47]:
city rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
0 Chicago 1
1 Boston 4
2 Los Angeles 5
In [48]:
engine.execute("SELECT city, rank FROM df11 UNION SELECT city, rank FROM df12;").fetchall()
Out[48]:
[('Boston', 4),
 ('Chicago', 1),
 ('Los Angeles', 5),
 ('New York City', 3),
 ('San Francisco', 2)]
In [50]:
pd.concat([df11, df12]).drop_duplicates()
Out[50]:
city rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
1 Boston 4
2 Los Angeles 5
In [51]:
engine.execute("SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;").fetchall()
Out[51]:
[(183, 23.17, 6.5, 'Male', 'Yes', 'Sun', 'Dinner', 4),
 (214, 28.17, 6.5, 'Female', 'Yes', 'Sat', 'Dinner', 3),
 (47, 32.4, 6.0, 'Male', 'No', 'Sun', 'Dinner', 4),
 (239, 29.03, 5.92, 'Male', 'No', 'Sat', 'Dinner', 3),
 (88, 24.71, 5.85, 'Male', 'No', 'Thur', 'Lunch', 2),
 (181, 23.33, 5.65, 'Male', 'Yes', 'Sun', 'Dinner', 2),
 (44, 30.4, 5.6, 'Male', 'No', 'Sun', 'Dinner', 4),
 (52, 34.81, 5.2, 'Female', 'No', 'Sun', 'Dinner', 4),
 (85, 34.83, 5.17, 'Female', 'No', 'Thur', 'Lunch', 4),
 (211, 25.89, 5.16, 'Male', 'Yes', 'Sat', 'Dinner', 4)]
In [52]:
tips.nlargest(10 + 5, columns='tip').tail(10)
Out[52]:
total_bill tip sex smoker day time size
183 23.17 6.50 Male Yes Sun Dinner 4
214 28.17 6.50 Female Yes Sat Dinner 3
47 32.40 6.00 Male No Sun Dinner 4
239 29.03 5.92 Male No Sat Dinner 3
88 24.71 5.85 Male No Thur Lunch 2
181 23.33 5.65 Male Yes Sun Dinner 2
44 30.40 5.60 Male No Sun Dinner 4
52 34.81 5.20 Female No Sun Dinner 4
85 34.83 5.17 Female No Thur Lunch 4
211 25.89 5.16 Male Yes Sat Dinner 4
In [54]:
engine.execute("UPDATE tips SET tip = tip*2 WHERE tip < 2;")
Out[54]:
<sqlalchemy.engine.result.ResultProxy at 0x7f38a68a0e10>
In [55]:
tips.loc[tips['tip'] < 2, 'tip'] *= 2
In [56]:
engine.execute("DELETE FROM tips WHERE tip > 9;")
Out[56]:
<sqlalchemy.engine.result.ResultProxy at 0x7f38a68a0e80>
In [57]:
tips = tips.loc[tips['tip'] <= 9]