Pandas and sql - I

In [1]:
import numpy as np
import pandas as pd

# read the data from the downloaded CSV file.
data = pd.read_csv('../../../input/winemag-data-130k-v2.csv')
data.head(4)
Out[1]:
Unnamed: 0 country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 Italy Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna NaN Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend Nicosia
1 1 Portugal This is ripe and fruity, a wine that is smooth... Avidagos 87 15.0 Douro NaN NaN Roger Voss @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos
2 2 US Tart and snappy, the flavors of lime flesh and... NaN 87 14.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Rainstorm 2013 Pinot Gris (Willamette Valley) Pinot Gris Rainstorm
3 3 US Pineapple rind, lemon pith and orange blossom ... Reserve Late Harvest 87 13.0 Michigan Lake Michigan Shore NaN Alexander Peartree NaN St. Julian 2013 Reserve Late Harvest Riesling ... Riesling St. Julian
In [2]:
data.columns
Out[2]:
Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')
In [3]:
data.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
data.columns[0]
Out[3]:
'id'
In [4]:
from sqlalchemy import create_engine

#Create an in-memory SQLite database.
engine = create_engine('sqlite://', echo=False)
data.to_sql('data', con=engine)
engine.execute("SELECT count(*) FROM data").fetchall()
Out[4]:
[(129971,)]
In [5]:
import sqlite3 as sql

#create sqlite database (file) from csv
conn = sql.connect("test.db")
#dataframe to sqlite 
data.to_sql("data", conn, if_exists='append', index=False)
#pandas query sqlite database
result = pd.read_sql_query("select count(*) from data",conn)
result
Out[5]:
count(*)
0 649855
In [6]:
#sqlite query
cursor = conn.cursor()
cursor.execute("select count(*) from data")
cursor.fetchall()
Out[6]:
[(649855,)]

SELECT

SQL Pandas
select * from data data
select * from data limit 2 data.head(2)
select id from data where country = 'China' data[data.id == 'country'].id
select distinct country from data data.country.unique()
In [7]:
engine.execute("select * from data limit 2").fetchall()
Out[7]:
[(0, 0, 'Italy', "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.", 'Vulkà Bianco', 87, None, 'Sicily & Sardinia', 'Etna', None, 'Kerin O’Keefe', '@kerinokeefe', 'Nicosia 2013 Vulkà Bianco  (Etna)', 'White Blend', 'Nicosia'),
 (1, 1, 'Portugal', "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.", 'Avidagos', 87, 15.0, 'Douro', None, None, 'Roger Voss', '@vossroger', 'Quinta dos Avidagos 2011 Avidagos Red (Douro)', 'Portuguese Red', 'Quinta dos Avidagos')]
In [8]:
data.head(2)
Out[8]:
id country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 Italy Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna NaN Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend Nicosia
1 1 Portugal This is ripe and fruity, a wine that is smooth... Avidagos 87 15.0 Douro NaN NaN Roger Voss @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos
In [9]:
engine.execute("select id from data where country = 'China'").fetchall()
Out[9]:
[(109989,)]
In [10]:
data[data.country == 'China'].id
Out[10]:
109989    109989
Name: id, dtype: int64
In [11]:
engine.execute("SELECT distinct(country) FROM data").fetchall()
Out[11]:
[('Italy',),
 ('Portugal',),
 ('US',),
 ('Spain',),
 ('France',),
 ('Germany',),
 ('Argentina',),
 ('Chile',),
 ('Australia',),
 ('Austria',),
 ('South Africa',),
 ('New Zealand',),
 ('Israel',),
 ('Hungary',),
 ('Greece',),
 ('Romania',),
 ('Mexico',),
 ('Canada',),
 (None,),
 ('Turkey',),
 ('Czech Republic',),
 ('Slovenia',),
 ('Luxembourg',),
 ('Croatia',),
 ('Georgia',),
 ('Uruguay',),
 ('England',),
 ('Lebanon',),
 ('Serbia',),
 ('Brazil',),
 ('Moldova',),
 ('Morocco',),
 ('Peru',),
 ('India',),
 ('Bulgaria',),
 ('Cyprus',),
 ('Armenia',),
 ('Switzerland',),
 ('Bosnia and Herzegovina',),
 ('Ukraine',),
 ('Slovakia',),
 ('Macedonia',),
 ('China',),
 ('Egypt',)]
In [12]:
data.country.unique()
Out[12]:
array(['Italy', 'Portugal', 'US', 'Spain', 'France', 'Germany',
       'Argentina', 'Chile', 'Australia', 'Austria', 'South Africa',
       'New Zealand', 'Israel', 'Hungary', 'Greece', 'Romania', 'Mexico',
       'Canada', nan, 'Turkey', 'Czech Republic', 'Slovenia',
       'Luxembourg', 'Croatia', 'Georgia', 'Uruguay', 'England',
       'Lebanon', 'Serbia', 'Brazil', 'Moldova', 'Morocco', 'Peru',
       'India', 'Bulgaria', 'Cyprus', 'Armenia', 'Switzerland',
       'Bosnia and Herzegovina', 'Ukraine', 'Slovakia', 'Macedonia',
       'China', 'Egypt'], dtype=object)
In [13]:
engine.execute("select * from data where province = 'Oregon' and price = 10 limit 3").fetchall()
Out[13]:
[(4872, 4872, 'US', "Veteran producer Oak Knoll remains one of Oregon's most underrated value wineries. This full-bodied, super-fruity Riesling offers bright, off-dry, well-balanced flavors of citrus, honey and tea.", 'Semi-Sweet', 87, 10.0, 'Oregon', 'Willamette Valley', 'Willamette Valley', 'Paul Gregutt', '@paulgwine\xa0', 'Oak Knoll 2013 Semi-Sweet Riesling (Willamette Valley)', 'Riesling', 'Oak Knoll'),
 (7488, 7488, 'US', "This is an odd duck for Gewürz; lacking any of the floral character generally associated with that grape. It's lemony and slightly sweet, with good acid. It makes a pleasant, if generic white wine to accompany spicy noodles or light seafood.", None, 83, 10.0, 'Oregon', 'Southern Oregon', 'Southern Oregon', 'Paul Gregutt', '@paulgwine\xa0', 'Bridgeview 2006 Gewürztraminer (Southern Oregon)', 'Gewürztraminer', 'Bridgeview'),
 (17796, 17796, 'US', "This popular blend of seven red grapes is dark and strongly flavored with a streak of vanilla. There's nothing fancy here, just big, vanilla-soaked flavors, at an affordable price.", 'Rogue', 85, 10.0, 'Oregon', 'Oregon', 'Oregon Other', 'Paul Gregutt', '@paulgwine\xa0', 'Valley View NV Rogue Red (Oregon)', 'Red Blend', 'Valley View')]
In [14]:
data[(data.province=='Oregon') & (data.price==10)].head(3)
Out[14]:
id country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
4872 4872 US Veteran producer Oak Knoll remains one of Oreg... Semi-Sweet 87 10.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Oak Knoll 2013 Semi-Sweet Riesling (Willamette... Riesling Oak Knoll
7488 7488 US This is an odd duck for Gewürz; lacking any of... NaN 83 10.0 Oregon Southern Oregon Southern Oregon Paul Gregutt @paulgwine Bridgeview 2006 Gewürztraminer (Southern Oregon) Gewürztraminer Bridgeview
17796 17796 US This popular blend of seven red grapes is dark... Rogue 85 10.0 Oregon Oregon Oregon Other Paul Gregutt @paulgwine Valley View NV Rogue Red (Oregon) Red Blend Valley View
In [15]:
engine.execute("select id, country, description from data limit 3").fetchall()
Out[15]:
[(0, 'Italy', "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."),
 (1, 'Portugal', "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016."),
 (2, 'US', 'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.')]
In [16]:
data[['id', 'country', 'description']].head(3)
Out[16]:
id country description
0 0 Italy Aromas include tropical fruit, broom, brimston...
1 1 Portugal This is ripe and fruity, a wine that is smooth...
2 2 US Tart and snappy, the flavors of lime flesh and...
In [17]:
engine.execute("select id, country, description from data where province = 'Oregon' and price = 10 limit 3").fetchall()
Out[17]:
[(4872, 'US', "Veteran producer Oak Knoll remains one of Oregon's most underrated value wineries. This full-bodied, super-fruity Riesling offers bright, off-dry, well-balanced flavors of citrus, honey and tea."),
 (7488, 'US', "This is an odd duck for Gewürz; lacking any of the floral character generally associated with that grape. It's lemony and slightly sweet, with good acid. It makes a pleasant, if generic white wine to accompany spicy noodles or light seafood."),
 (17796, 'US', "This popular blend of seven red grapes is dark and strongly flavored with a streak of vanilla. There's nothing fancy here, just big, vanilla-soaked flavors, at an affordable price.")]
In [18]:
data[(data.province=='Oregon') & (data.price==10)][['id', 'country', 'description']].head(3)
Out[18]:
id country description
4872 4872 US Veteran producer Oak Knoll remains one of Oreg...
7488 7488 US This is an odd duck for Gewürz; lacking any of...
17796 17796 US This popular blend of seven red grapes is dark...
In [19]:
engine.execute("select * from data where winery = 'Nicosia' order by variety desc limit 3").fetchall()
Out[19]:
[(0, 0, 'Italy', "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.", 'Vulkà Bianco', 87, None, 'Sicily & Sardinia', 'Etna', None, 'Kerin O’Keefe', '@kerinokeefe', 'Nicosia 2013 Vulkà Bianco  (Etna)', 'White Blend', 'Nicosia'),
 (12551, 12551, 'Italy', 'A delicate floral fragrance of white flower, pear, citrus, hay and a hint of beeswax and Mediterranean herb lead the nose. The palate is polished and bright, showing green apple, citrus and mineral alongside fresh acidity.', 'Fondo Filara Bianco', 88, None, 'Sicily & Sardinia', 'Etna', None, 'Kerin O’Keefe', '@kerinokeefe', 'Nicosia 2013 Fondo Filara Bianco  (Etna)', 'White Blend', 'Nicosia'),
 (52317, 52317, 'Italy', "A blend of Carricante and Catarratto (two of Sicily's most expressive indigenous varieties), this Etna white would pair well with seafood or vegetarian dishes. It boasts focused tones of citrus and Granny Smith apple.", 'Fondo Filara', 87, 19.0, 'Sicily & Sardinia', 'Etna', None, None, None, 'Nicosia 2010 Fondo Filara  (Etna)', 'White Blend', 'Nicosia')]
In [20]:
data[(data.winery=='Nicosia')].sort_values('variety',ascending=False).head(3)
Out[20]:
id country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 Italy Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna NaN Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend Nicosia
12551 12551 Italy A delicate floral fragrance of white flower, p... Fondo Filara Bianco 88 NaN Sicily & Sardinia Etna NaN Kerin O’Keefe @kerinokeefe Nicosia 2013 Fondo Filara Bianco (Etna) White Blend Nicosia
52317 52317 Italy A blend of Carricante and Catarratto (two of S... Fondo Filara 87 19.0 Sicily & Sardinia Etna NaN NaN NaN Nicosia 2010 Fondo Filara (Etna) White Blend Nicosia
In [21]:
engine.execute("select * from data where province in ('Oregon','Michigan') limit 3").fetchall()
Out[21]:
[(2, 2, 'US', 'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.', None, 87, 14.0, 'Oregon', 'Willamette Valley', 'Willamette Valley', 'Paul Gregutt', '@paulgwine\xa0', 'Rainstorm 2013 Pinot Gris (Willamette Valley)', 'Pinot Gris', 'Rainstorm'),
 (3, 3, 'US', 'Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.', 'Reserve Late Harvest', 87, 13.0, 'Michigan', 'Lake Michigan Shore', None, 'Alexander Peartree', None, 'St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore)', 'Riesling', 'St. Julian'),
 (4, 4, 'US', "Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.", "Vintner's Reserve Wild Child Block", 87, 65.0, 'Oregon', 'Willamette Valley', 'Willamette Valley', 'Paul Gregutt', '@paulgwine\xa0', "Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willamette Valley)", 'Pinot Noir', 'Sweet Cheeks')]
In [22]:
data[data.province.isin(['Oregon', 'Michigan'])].head(3)
Out[22]:
id country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
2 2 US Tart and snappy, the flavors of lime flesh and... NaN 87 14.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Rainstorm 2013 Pinot Gris (Willamette Valley) Pinot Gris Rainstorm
3 3 US Pineapple rind, lemon pith and orange blossom ... Reserve Late Harvest 87 13.0 Michigan Lake Michigan Shore NaN Alexander Peartree NaN St. Julian 2013 Reserve Late Harvest Riesling ... Riesling St. Julian
4 4 US Much like the regular bottling from 2012, this... Vintner's Reserve Wild Child Block 87 65.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Sweet Cheeks 2012 Vintner's Reserve Wild Child... Pinot Noir Sweet Cheeks
In [23]:
engine.execute("select * from data where province not in ('Oregon','Michigan') limit 3").fetchall()
Out[23]:
[(0, 0, 'Italy', "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.", 'Vulkà Bianco', 87, None, 'Sicily & Sardinia', 'Etna', None, 'Kerin O’Keefe', '@kerinokeefe', 'Nicosia 2013 Vulkà Bianco  (Etna)', 'White Blend', 'Nicosia'),
 (1, 1, 'Portugal', "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.", 'Avidagos', 87, 15.0, 'Douro', None, None, 'Roger Voss', '@vossroger', 'Quinta dos Avidagos 2011 Avidagos Red (Douro)', 'Portuguese Red', 'Quinta dos Avidagos'),
 (5, 5, 'Spain', 'Blackberry and raspberry aromas show a typical Navarran whiff of green herbs and, in this case, horseradish. In the mouth, this is fairly full bodied, with tomatoey acidity. Spicy, herbal flavors complement dark plum fruit, while the finish is fresh but grabby.', 'Ars In Vitro', 87, 15.0, 'Northern Spain', 'Navarra', None, 'Michael Schachner', '@wineschach', 'Tandem 2011 Ars In Vitro Tempranillo-Merlot (Navarra)', 'Tempranillo-Merlot', 'Tandem')]
In [24]:
data[~data.province.isin(['Oregon', 'Michigan'])].head(3)
Out[24]:
id country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 Italy Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna NaN Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend Nicosia
1 1 Portugal This is ripe and fruity, a wine that is smooth... Avidagos 87 15.0 Douro NaN NaN Roger Voss @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos
5 5 Spain Blackberry and raspberry aromas show a typical... Ars In Vitro 87 15.0 Northern Spain Navarra NaN Michael Schachner @wineschach Tandem 2011 Ars In Vitro Tempranillo-Merlot (N... Tempranillo-Merlot Tandem
In [25]:
engine.execute("select region_1, variety, count(*) from data group by region_1, variety order by variety limit 10").fetchall()
Out[25]:
[(None, None, 1),
 ('Côtes du Marmandais', 'Abouriou', 2),
 ('Russian River Valley', 'Abouriou', 1),
 (None, 'Agiorgitiko', 63),
 ('Aglianico del Beneventano', 'Aglianico', 2),
 ('Aglianico del Taburno', 'Aglianico', 13),
 ('Aglianico del Vulture', 'Aglianico', 90),
 ('Amador County', 'Aglianico', 2),
 ('Basilicata', 'Aglianico', 6),
 ('Beneventano', 'Aglianico', 3)]
In [26]:
data.groupby(['region_1','variety']).size().to_frame('size').sort_values(by=['variety']).head(10)
Out[26]:
size
region_1 variety
Russian River Valley Abouriou 1
Côtes du Marmandais Abouriou 2
Mendocino County Aglianico 1
St. Helena Aglianico 1
Cilento Aglianico 2
Clear Lake Aglianico 1
Colli di Salerno Aglianico 1
Amador County Aglianico 2
Sant' Agata dei Goti Aglianico 1
Sannio Aglianico 3
In [27]:
engine.execute("select country, count(country) from data where taster_name='Roger Voss' group by country having count()>1000 order by count() desc").fetchall()
Out[27]:
[('France', 18602), ('Portugal', 5658), ('Austria', 1130)]
In [28]:
data[data.taster_name == 'Roger Voss'].groupby('country').filter(lambda g: len(g) > 1000).groupby('country').size().sort_values(ascending=False)
Out[28]:
country
France      18602
Portugal     5658
Austria      1130
dtype: int64
In [29]:
#sqlite have only min and max
engine.execute("select max(price), min(price) from data").fetchall()
Out[29]:
[(3300.0, 4.0)]
In [30]:
data.agg({'price': ['min', 'max', 'mean', 'median','std']})
Out[30]:
price
min 4.000000
max 3300.000000
mean 35.363389
median 25.000000
std 41.022218
In [31]:
# finding STD 
#std = data.std(axis = 0, skipna = True)
#m = data.mean(axis = 0, skipna = True)