# Importing Required Libraries
import pandas as pd
import gzip
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use(['bmh']) # bmh Styling used for Visulization
# Defining the functions to read the data into dataframes
def parse(path):
g = gzip.open(path, 'rb')
for l in g:
yield eval(l)
def getDF(path):
i = 0
df = {}
for d in parse(path):
df[i] = d
i += 1
return pd.DataFrame.from_dict(df, orient='index')
# Creating Reviews Music Instruments Dataframe
df_musI = getDF('reviews_Musical_Instruments.json.gz')
# Creating meta Music Instruments Dataframe
df_musI_meta = getDF('meta_Musical_Instruments.json.gz')
# Merging the two dataframes together
df_musI_merged = df_musI.merge(df_musI_meta, how='inner', on='asin')
# Viewing first 5 rows of the merged dataset
df_musI_merged.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | title | price | imUrl | salesRank | categories | related | brand | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A1YS9MDZP93857 | 0006428320 | John Taylor | [0, 0] | The portfolio is fine except for the fact that... | 3.0 | Parts missing | 1394496000 | 03 11, 2014 | Six Sonatas For Two Flutes Or Violins, Volume ... | 17.95 | http://ecx.images-amazon.com/images/I/41EpRmh8... | {'Musical Instruments': 207315} | [[Musical Instruments, Instrument Accessories,... | NaN | NaN | NaN |
1 | A3TS466QBAWB9D | 0014072149 | Silver Pencil | [0, 0] | If you are a serious violin student on a budge... | 5.0 | Perform it with a friend, today! | 1370476800 | 06 6, 2013 | Double Concerto in D Minor By Johann Sebastian... | 18.77 | http://ecx.images-amazon.com/images/I/41m6ygCq... | {'Musical Instruments': 94593} | [[Musical Instruments]] | {'also_viewed': ['B0058DK7RA'], 'buy_after_vie... | Composer: J.S. Bach.Peters Edition.For two vio... | |
2 | A3BUDYITWUSIS7 | 0041291905 | joyce gabriel cornett | [0, 0] | This is and excellent edition and perfectly tr... | 5.0 | Vivalldi's Four Seasons | 1381708800 | 10 14, 2013 | Hal Leonard Vivaldi Four Seasons for Piano (Or... | NaN | http://ecx.images-amazon.com/images/I/41maAqSO... | {'Musical Instruments': 222972} | [[Musical Instruments, Instrument Accessories,... | NaN | NaN | Vivaldi's famous set of four violin concertos ... |
3 | A19K10Z0D2NTZK | 0041913574 | TexasCowboy | [0, 0] | Perfect for someone who is an opera fan or a w... | 5.0 | Full score: voice and orchestra | 1285200000 | 09 23, 2010 | Aida: Opera in Quattro Atti, Partitura -- Aida... | 49.99 | http://ecx.images-amazon.com/images/I/513kRMv%... | NaN | [[Musical Instruments]] | NaN | NaN | 444 pages. \nReprint of corrected and revised ... |
4 | A14X336IB4JD89 | 0201891859 | dfjm53 | [0, 1] | How many Nocturnes does it contain? All of the... | 1.0 | Unable to determine contents | 1350432000 | 10 17, 2012 | Nocturnes | NaN | http://ecx.images-amazon.com/images/I/41SXCAzs... | {'Musical Instruments': 171871} | [[Musical Instruments, Instrument Accessories,... | NaN | NaN | NaN |
# Checking the Number of rows nad columns
df_musI_merged.shape
(500176, 17)
# Importing review Digital Music data into dataframe
df_DigMus = getDF('reviews_Digital_Music.json.gz')
# Importing meta Digital Music data into dataframe
df_DigMus_meta = getDF('meta_Digital_Music.json.gz')
# Merging the dataframe
df_DigMus_merged = df_DigMus.merge(df_DigMus_meta, how='inner', on='asin')
df_DigMus_merged.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | title | price | imUrl | related | salesRank | categories | description | brand | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A2EFCYXHNK06IS | 5555991584 | Abigail Perkins "Abby "Reads Too Much"... | [4, 5] | The anthemic title track begins "The Memo... | 5.0 | Enya Experiments And Succeeds | 978480000 | 01 3, 2001 | Memory of Trees | 9.49 | http://ecx.images-amazon.com/images/I/51b5WDjd... | {'also_bought': ['B000002LRT', 'B000002LRR', '... | {'Music': 939190} | [[CDs & Vinyl, New Age, Celtic New Age], [CDs ... | NaN | NaN |
1 | A1WR23ER5HMAA9 | 5555991584 | AKB | [1, 1] | Just when I thought Enya couldn't possibly get... | 5.0 | How to improve upon perfection. | 953424000 | 03 19, 2000 | Memory of Trees | 9.49 | http://ecx.images-amazon.com/images/I/51b5WDjd... | {'also_bought': ['B000002LRT', 'B000002LRR', '... | {'Music': 939190} | [[CDs & Vinyl, New Age, Celtic New Age], [CDs ... | NaN | NaN |
2 | A2IR4Q0GPAFJKW | 5555991584 | Alexander | [0, 0] | Nice CD for easy listening. My husband and I ... | 4.0 | Good listening. | 1393545600 | 02 28, 2014 | Memory of Trees | 9.49 | http://ecx.images-amazon.com/images/I/51b5WDjd... | {'also_bought': ['B000002LRT', 'B000002LRR', '... | {'Music': 939190} | [[CDs & Vinyl, New Age, Celtic New Age], [CDs ... | NaN | NaN |
3 | A2V0KUVAB9HSYO | 5555991584 | Alison Hight | [0, 1] | I really liked this CD, especially the differe... | 4.0 | Loved It | 966124800 | 08 13, 2000 | Memory of Trees | 9.49 | http://ecx.images-amazon.com/images/I/51b5WDjd... | {'also_bought': ['B000002LRT', 'B000002LRR', '... | {'Music': 939190} | [[CDs & Vinyl, New Age, Celtic New Age], [CDs ... | NaN | NaN |
4 | A1J0GL9HCA7ELW | 5555991584 | Al the Pal "Al the Pal" | [3, 3] | Enya's richly chorded style has smitten me onc... | 5.0 | Another Hauntingly Beautiful Collection of Songs | 1007683200 | 12 7, 2001 | Memory of Trees | 9.49 | http://ecx.images-amazon.com/images/I/51b5WDjd... | {'also_bought': ['B000002LRT', 'B000002LRR', '... | {'Music': 939190} | [[CDs & Vinyl, New Age, Celtic New Age], [CDs ... | NaN | NaN |
df_DigMus_merged.shape
(836006, 17)
# Importing reviews_Baby data into dataframe
df_Baby = getDF('reviews_Baby.json.gz')
# Importing meta_Baby data into dataframe
df_Baby_meta = getDF('meta_Baby.json.gz')
# Merging the two dataframes
df_Baby_merged = df_Baby.merge(df_Baby_meta, how='inner', on='asin')
df_Baby_merged.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | categories | description | title | price | imUrl | brand | related | salesRank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A28O3NP6WR5517 | 0188399313 | Jennifer gymer | [0, 0] | They work very well. Easy to clean, we wash th... | 5.0 | These bottles are great! | 1369612800 | 05 27, 2013 | [[Baby]] | Wee-Go Glass baby bottles by LifeFactory (Baby... | Lifefactory 4oz BPA Free Glass Baby Bottles - ... | 69.99 | http://ecx.images-amazon.com/images/I/41Swthpd... | Lifefactory | {'also_bought': ['B002SG7K7A', 'B003CJSXW8', '... | NaN |
1 | AX0M1Z6ZWO52J | 0188399399 | Ash M. | [1, 1] | it came early and was not disappointed. i love... | 5.0 | perfect | 1365465600 | 04 9, 2013 | [[Baby]] | The Planet Wise Wipe PouchTM features our pate... | Planetwise Wipe Pouch | 10.95 | http://ecx.images-amazon.com/images/I/61x8h9u6... | NaN | {'also_bought': ['B005WWI0DA', 'B005WWIMGA', '... | NaN |
2 | A1KD7N84L7NIUT | 0188399518 | Buffy Fan | [0, 0] | I ended up with a variety of different brands ... | 4.0 | Good cotton wipes | 1392336000 | 02 14, 2014 | [[Baby]] | The Planet Wise Flannel Wipes are 10 super sof... | Planetwise Flannel Wipes | 15.95 | http://ecx.images-amazon.com/images/I/41otjnA4... | Planet Wise | {'also_bought': ['B00G96N3YY', 'B003XSEV2O', '... | NaN |
3 | A29CUDEIF4X1UO | 0188399518 | J. D. Solbach | [1, 1] | These flannel wipes are OK, but in my opinion ... | 3.0 | Sending them back. | 1373241600 | 07 8, 2013 | [[Baby]] | The Planet Wise Flannel Wipes are 10 super sof... | Planetwise Flannel Wipes | 15.95 | http://ecx.images-amazon.com/images/I/41otjnA4... | Planet Wise | {'also_bought': ['B00G96N3YY', 'B003XSEV2O', '... | NaN |
4 | A32592TYN6C9EM | 0316967297 | Amazon Customer | [0, 0] | Cute quilt, the colors are perfect and my litt... | 4.0 | Very cute | 1378425600 | 09 6, 2013 | [[Baby]] | Hand crafted set includes 1 full quilt (76x86 ... | Annas Dream Full Quilt with 2 Shams | 109.95 | http://ecx.images-amazon.com/images/I/51%2BZ1%... | NaN | {'also_viewed': ['B009LTER3W', 'B00575TI5Q', '... | NaN |
df_Baby_merged.shape
(915446, 17)
# Importing reviews_Patio_Lawn_and_Garden data into dataframe
df_Patio = getDF('reviews_Patio_Lawn_and_Garden.json.gz')
# Importing meta_Patio_Lawn_and_Garden data into dataframe
df_Patio_meta = getDF('meta_Patio_Lawn_and_Garden.json.gz')
# Merging the two dataframes
df_Patio_merged = df_Patio.merge(df_Patio_meta, how='inner', on='asin')
df_Patio_merged.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | description | title | imUrl | related | salesRank | categories | price | brand | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A2VNYWOPJ13AFP | 0981850006 | Amazon Customer "carringt0n" | [6, 7] | This was a gift for my other husband. He's ma... | 5.0 | Delish | 1259798400 | 12 3, 2009 | Steven Raichlen's Best of Barbecue Primal Gril... | Primal Grill with Steven Raichlen, Volume One | http://ecx.images-amazon.com/images/I/51cNn5Dl... | {'also_viewed': ['0761149430', '0761120149', '... | {'Movies & TV': 231134} | [[Patio, Lawn & Garden, Grills & Outdoor Cooki... | NaN | NaN |
1 | A20DWVV8HML3AW | 0981850006 | Cyndy | [0, 0] | My husband rarely asks for anything specific, ... | 5.0 | Just what my husband wanted | 1371081600 | 06 13, 2013 | Steven Raichlen's Best of Barbecue Primal Gril... | Primal Grill with Steven Raichlen, Volume One | http://ecx.images-amazon.com/images/I/51cNn5Dl... | {'also_viewed': ['0761149430', '0761120149', '... | {'Movies & TV': 231134} | [[Patio, Lawn & Garden, Grills & Outdoor Cooki... | NaN | NaN |
2 | A3RVP3YBYYOPRH | 0981850006 | John Warren | [9, 11] | This guy knows his stuff. I love the seriously... | 5.0 | Love This Show! | 1257984000 | 11 12, 2009 | Steven Raichlen's Best of Barbecue Primal Gril... | Primal Grill with Steven Raichlen, Volume One | http://ecx.images-amazon.com/images/I/51cNn5Dl... | {'also_viewed': ['0761149430', '0761120149', '... | {'Movies & TV': 231134} | [[Patio, Lawn & Garden, Grills & Outdoor Cooki... | NaN | NaN |
3 | A28XY55TP3Q90O | 0981850006 | JSG "Lover of Learning" | [1, 1] | THIS CAN BE FOUND AT: [...] FOR $19.99, also a... | 5.0 | BUY ELSEWHERE, TOO EXPENSIVE | 1314144000 | 08 24, 2011 | Steven Raichlen's Best of Barbecue Primal Gril... | Primal Grill with Steven Raichlen, Volume One | http://ecx.images-amazon.com/images/I/51cNn5Dl... | {'also_viewed': ['0761149430', '0761120149', '... | {'Movies & TV': 231134} | [[Patio, Lawn & Garden, Grills & Outdoor Cooki... | NaN | NaN |
4 | A3VZW1BGUQO0V3 | 0981850006 | Marc G. Belisle "Allswell2001" | [2, 2] | Quite good particularly for inspiration (DVD g... | 5.0 | Quite good but I *much* prefer the cookbooks | 1308268800 | 06 17, 2011 | Steven Raichlen's Best of Barbecue Primal Gril... | Primal Grill with Steven Raichlen, Volume One | http://ecx.images-amazon.com/images/I/51cNn5Dl... | {'also_viewed': ['0761149430', '0761120149', '... | {'Movies & TV': 231134} | [[Patio, Lawn & Garden, Grills & Outdoor Cooki... | NaN | NaN |
df_Patio_merged.shape
(993490, 17)
# Importing reviews_Pet_Supplies data into dataframe
df_PetSup = getDF('reviews_Pet_Supplies.json.gz')
# Importing meta_Pet_Supplies data into dataframe
df_PetSup_meta = getDF('meta_Pet_Supplies.json.gz')
# Merging the dataframe
df_PetSup_merged = df_PetSup.merge(df_PetSup_meta, how='inner', on='asin')
df_PetSup_merged.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | related | title | price | salesRank | imUrl | brand | categories | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A3PG0KS1YE8MR4 | 0615553605 | Big Sky Gal | [3, 3] | Recipes are easy and the dogs love them. I wou... | 5.0 | Great little book! | 1354838400 | 12 7, 2012 | {'also_bought': ['0836269195', '1931993807', '... | Pet Qwerks Treat Cookbook with Cutters | 10.95 | {'Pet Supplies': 34134} | http://ecx.images-amazon.com/images/I/41yroPoJ... | Pet Qwerks | [[Pet Supplies, Dogs, Toys]] | Know exactly what your dog is eating with 101 ... |
1 | A363P047LR5XI6 | 0615553605 | Miguel Giraldo | [2, 2] | I received the book in the mail yesterday, the... | 4.0 | Nice little book | 1373932800 | 07 16, 2013 | {'also_bought': ['0836269195', '1931993807', '... | Pet Qwerks Treat Cookbook with Cutters | 10.95 | {'Pet Supplies': 34134} | http://ecx.images-amazon.com/images/I/41yroPoJ... | Pet Qwerks | [[Pet Supplies, Dogs, Toys]] | Know exactly what your dog is eating with 101 ... |
2 | ABZ8CQXD42H4 | 0615553605 | Ross Lockhart | [0, 0] | Not so good. Some of the recipes are not what... | 1.0 | Did not like the book | 1386028800 | 12 3, 2013 | {'also_bought': ['0836269195', '1931993807', '... | Pet Qwerks Treat Cookbook with Cutters | 10.95 | {'Pet Supplies': 34134} | http://ecx.images-amazon.com/images/I/41yroPoJ... | Pet Qwerks | [[Pet Supplies, Dogs, Toys]] | Know exactly what your dog is eating with 101 ... |
3 | A3J8QW1MV1OP01 | 0615583474 | Beth Light | [4, 4] | This is a fun, interactive book that incorpora... | 5.0 | This is a Precious Book for Precious Cats | 1347321600 | 09 11, 2012 | {'also_viewed': ['1937406679', '1936319284', '... | It's A Cat's Life | NaN | {'Pet Supplies': 452691} | http://ecx.images-amazon.com/images/I/51%2BBp0... | NaN | [[Pet Supplies]] | An Interactive Cat Book, by Howie "The World's... |
4 | A3ISA8Z0NB0ILH | 0615583474 | George D. Marshall "george1767" | [3, 3] | 5.0 | Video review of Howie and Friends | 1358899200 | 01 23, 2013 | {'also_viewed': ['1937406679', '1936319284', '... | It's A Cat's Life | NaN | {'Pet Supplies': 452691} | http://ecx.images-amazon.com/images/I/51%2BBp0... | NaN | [[Pet Supplies]] | An Interactive Cat Book, by Howie "The World's... |
df_PetSup_merged.shape
(1235316, 17)
df = [df_musI_merged, df_DigMus_merged,df_Baby_merged,df_Patio_merged,df_PetSup_merged]
df_1 = pd.concat(df, ignore_index=True)
df_1.shape
(4480434, 17)
df_1.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | title | price | imUrl | salesRank | categories | related | brand | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A1YS9MDZP93857 | 0006428320 | John Taylor | [0, 0] | The portfolio is fine except for the fact that... | 3.0 | Parts missing | 1394496000 | 03 11, 2014 | Six Sonatas For Two Flutes Or Violins, Volume ... | 17.95 | http://ecx.images-amazon.com/images/I/41EpRmh8... | {'Musical Instruments': 207315} | [[Musical Instruments, Instrument Accessories,... | NaN | NaN | NaN |
1 | A3TS466QBAWB9D | 0014072149 | Silver Pencil | [0, 0] | If you are a serious violin student on a budge... | 5.0 | Perform it with a friend, today! | 1370476800 | 06 6, 2013 | Double Concerto in D Minor By Johann Sebastian... | 18.77 | http://ecx.images-amazon.com/images/I/41m6ygCq... | {'Musical Instruments': 94593} | [[Musical Instruments]] | {'also_viewed': ['B0058DK7RA'], 'buy_after_vie... | Composer: J.S. Bach.Peters Edition.For two vio... | |
2 | A3BUDYITWUSIS7 | 0041291905 | joyce gabriel cornett | [0, 0] | This is and excellent edition and perfectly tr... | 5.0 | Vivalldi's Four Seasons | 1381708800 | 10 14, 2013 | Hal Leonard Vivaldi Four Seasons for Piano (Or... | NaN | http://ecx.images-amazon.com/images/I/41maAqSO... | {'Musical Instruments': 222972} | [[Musical Instruments, Instrument Accessories,... | NaN | NaN | Vivaldi's famous set of four violin concertos ... |
3 | A19K10Z0D2NTZK | 0041913574 | TexasCowboy | [0, 0] | Perfect for someone who is an opera fan or a w... | 5.0 | Full score: voice and orchestra | 1285200000 | 09 23, 2010 | Aida: Opera in Quattro Atti, Partitura -- Aida... | 49.99 | http://ecx.images-amazon.com/images/I/513kRMv%... | NaN | [[Musical Instruments]] | NaN | NaN | 444 pages. \nReprint of corrected and revised ... |
4 | A14X336IB4JD89 | 0201891859 | dfjm53 | [0, 1] | How many Nocturnes does it contain? All of the... | 1.0 | Unable to determine contents | 1350432000 | 10 17, 2012 | Nocturnes | NaN | http://ecx.images-amazon.com/images/I/41SXCAzs... | {'Musical Instruments': 171871} | [[Musical Instruments, Instrument Accessories,... | NaN | NaN | NaN |
# Viewing dataframe columns and their datatypes
df_1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4480434 entries, 0 to 4480433 Data columns (total 17 columns): # Column Dtype --- ------ ----- 0 reviewerID object 1 asin object 2 reviewerName object 3 helpful object 4 reviewText object 5 overall float64 6 summary object 7 unixReviewTime int64 8 reviewTime object 9 title object 10 price float64 11 imUrl object 12 salesRank object 13 categories object 14 related object 15 brand object 16 description object dtypes: float64(2), int64(1), object(14) memory usage: 581.1+ MB
# Viewing Descriptive Information about the dataframe
df_1.describe(include ='all')
# Checking for missing values
df_1.isnull().sum()
reviewerID 0 asin 0 reviewerName 29582 helpful 0 reviewText 0 overall 0 summary 0 unixReviewTime 0 reviewTime 0 title 655521 price 550814 imUrl 6038 salesRank 1526239 categories 0 related 271179 brand 1879143 description 644149 dtype: int64
# Dropping all rows with missing values
df_cleaned = df_1.dropna()
# Checking for missing values after dropping missing values
df_cleaned.isnull().sum()
reviewerID 0 asin 0 reviewerName 0 helpful 0 reviewText 0 overall 0 summary 0 unixReviewTime 0 reviewTime 0 title 0 price 0 imUrl 0 salesRank 0 categories 0 related 0 brand 0 description 0 dtype: int64
df_cleaned.shape
(1894261, 17)
# Filtering the dataframe to get relevant columns
df_filtered = df_cleaned.filter(items=['asin', 'overall', 'title', 'price', 'categories', 'brand','reviewTime'])
df_filtered.head()
asin | overall | title | price | categories | brand | reviewTime | |
---|---|---|---|---|---|---|---|
1 | 0014072149 | 5.0 | Double Concerto in D Minor By Johann Sebastian... | 18.77 | [[Musical Instruments]] | 06 6, 2013 | |
24 | 0634061801 | 5.0 | 50 Licks Blues Style - DVD | 12.49 | [[Movies & TV, TV], [Musical Instruments, Inst... | Hal Leonard | 11 6, 2010 |
25 | 0634061801 | 4.0 | 50 Licks Blues Style - DVD | 12.49 | [[Movies & TV, TV], [Musical Instruments, Inst... | Hal Leonard | 08 25, 2005 |
26 | 0634061801 | 5.0 | 50 Licks Blues Style - DVD | 12.49 | [[Movies & TV, TV], [Musical Instruments, Inst... | Hal Leonard | 09 27, 2007 |
27 | 0634061801 | 4.0 | 50 Licks Blues Style - DVD | 12.49 | [[Movies & TV, TV], [Musical Instruments, Inst... | Hal Leonard | 04 8, 2010 |
# Grouping dataframe by asin column using grouby with aggregate function mean
df_avg = df_filtered.groupby(['asin'],as_index=False).mean()
df_avg
asin | overall | price | |
---|---|---|---|
0 | 0014072149 | 5.000000 | 18.77 |
1 | 0615553605 | 3.333333 | 10.95 |
2 | 0634061801 | 4.363636 | 12.49 |
3 | 0739042726 | 2.600000 | 22.36 |
4 | 0739045067 | 4.230769 | 27.14 |
... | ... | ... | ... |
111960 | B00L3R83U8 | 1.000000 | 123.99 |
111961 | B00L75HFOQ | 5.000000 | 10.50 |
111962 | B00LB14BFW | 5.000000 | 7.42 |
111963 | B00LB14BHA | 5.000000 | 21.20 |
111964 | SMLRBIMX03 | 4.500000 | 229.00 |
111965 rows × 3 columns
# Grouping dataframe by asin and brand column using grouby with aggregate function count
df_count = df_filtered.groupby(['asin','brand'],as_index=False).count()
df_count
asin | brand | overall | title | price | categories | reviewTime | |
---|---|---|---|---|---|---|---|
0 | 0014072149 | 1 | 1 | 1 | 1 | 1 | |
1 | 0615553605 | Pet Qwerks | 3 | 3 | 3 | 3 | 3 |
2 | 0634061801 | Hal Leonard | 11 | 11 | 11 | 11 | 11 |
3 | 0739042726 | Alfred | 5 | 5 | 5 | 5 | 5 |
4 | 0739045067 | Alfred | 13 | 13 | 13 | 13 | 13 |
... | ... | ... | ... | ... | ... | ... | ... |
111960 | B00L3R83U8 | Blazer | 1 | 1 | 1 | 1 | 1 |
111961 | B00L75HFOQ | TopStage | 1 | 1 | 1 | 1 | 1 |
111962 | B00LB14BFW | Nylabone | 1 | 1 | 1 | 1 | 1 |
111963 | B00LB14BHA | Nylabone | 1 | 1 | 1 | 1 | 1 |
111964 | SMLRBIMX03 | LR Baggs | 2 | 2 | 2 | 2 | 2 |
111965 rows × 7 columns
# Sorting df_count dataframe according to reviews count in descending order
df_sort = df_count.sort_values(by="overall",ascending=False)
# Extracting top 5 products with most reviews
df_top5 = df_sort.head()
df_top5
asin | brand | overall | title | price | categories | reviewTime | |
---|---|---|---|---|---|---|---|
65017 | B0040QOYZ2 | Furminator | 4107 | 4107 | 4107 | 4107 | 4107 |
29728 | B000ULAP4U | Audio-Technica | 3515 | 3515 | 3515 | 3515 | 3515 |
2578 | B00008DFGY | Frontline | 3493 | 3493 | 3493 | 3493 | 3493 |
22388 | B000HCLLMM | Classic Accessories | 3178 | 3178 | 3178 | 3178 | 3178 |
7494 | B0002DK2DU | Omega Paw | 3079 | 3079 | 3079 | 3079 | 3079 |
# Creating a pivot table to visualize the top 5 products with most reviews
df_top5_pivot = df_top5.pivot(index='asin',columns='brand',values='overall') #Creating Pivot Table
df_top5_pivot
brand | Audio-Technica | Classic Accessories | Frontline | Furminator | Omega Paw |
---|---|---|---|---|---|
asin | |||||
B00008DFGY | NaN | NaN | 3493.0 | NaN | NaN |
B0002DK2DU | NaN | NaN | NaN | NaN | 3079.0 |
B000HCLLMM | NaN | 3178.0 | NaN | NaN | NaN |
B000ULAP4U | 3515.0 | NaN | NaN | NaN | NaN |
B0040QOYZ2 | NaN | NaN | NaN | 4107.0 | NaN |
# Creating a visual for the top 5 products
df_top5_pivot.plot(kind='barh',figsize=(10,5))
plt.title('Top 5 Products with most reviews',fontsize = 20)
plt.ylabel('ASIN',fontsize = 15)
plt.xlabel('Number of Ratings',fontsize = 15)
plt.legend(fontsize = 10)
plt.show()
# Creating dataframe to create linechart
df_most_reviews = df_cleaned.filter(items=['asin','overall', 'brand','reviewTime'])
# Filtering and creating Year column to show Number of 5 star reviews every year of the top 5 products
df_top5 = df_most_reviews.loc[(df_most_reviews['asin'] == 'B00008DFGY') | (df_most_reviews['asin'] == 'B0002DK2DU') | (df_most_reviews['asin'] == 'B000ULAP4U') | (df_most_reviews['asin'] == 'B000HCLLMM') | (df_most_reviews['asin'] == 'B0040QOYZ2')]
df_top5 = df_top5.loc[(df_top5['overall'] == 5.0)]
df_top5['reviewTime'] = df_top5.reviewTime.str.replace(',','')
df_top5['reviewTime'] = df_top5.reviewTime.str.replace(' ','')
df_top5['reviewTime'] = pd.to_datetime(df_top5['reviewTime'], format='%m%d%Y')
df_top5['Year'] = pd.DatetimeIndex(df_top5['reviewTime']).year
df_review = df_top5.groupby(['asin','Year'],as_index=False).count()
df_review_pivot = df_review.pivot(index='Year',columns='asin',values='overall') #Creating Pivot Table
df_review_pivot = df_review_pivot.fillna(0)
df_review_pivot
asin | B00008DFGY | B0002DK2DU | B000HCLLMM | B000ULAP4U | B0040QOYZ2 |
---|---|---|---|---|---|
Year | |||||
2003 | 1.0 | NaN | NaN | NaN | NaN |
2005 | 7.0 | 8.0 | NaN | NaN | NaN |
2006 | 7.0 | 32.0 | NaN | NaN | NaN |
2007 | 37.0 | 96.0 | 41.0 | 1.0 | NaN |
2008 | 149.0 | 101.0 | 68.0 | 6.0 | NaN |
2009 | 215.0 | 88.0 | 77.0 | 32.0 | NaN |
2010 | 198.0 | 190.0 | 168.0 | 85.0 | 3.0 |
2011 | 216.0 | 341.0 | 226.0 | 255.0 | 251.0 |
2012 | 328.0 | 315.0 | 405.0 | 618.0 | 556.0 |
2013 | 752.0 | 349.0 | 929.0 | 1131.0 | 1855.0 |
2014 | 278.0 | 278.0 | 481.0 | 694.0 | 584.0 |
# Creating linechart to show 5 start ratings of top 5 product over the years
df_review_pivot.plot(kind='line',figsize=(10,5))
plt.title('Top 5 Products 5 Star Rating Over the Years',fontsize = 20)
plt.ylabel('Number of 5 Stars',fontsize = 15)
plt.xlabel('Year',fontsize = 15)
plt.legend(fontsize = 10)
plt.show()
df_cleaned.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | title | price | imUrl | salesRank | categories | related | brand | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | A3TS466QBAWB9D | 0014072149 | Silver Pencil | [0, 0] | If you are a serious violin student on a budge... | 5.0 | Perform it with a friend, today! | 1370476800 | 06 6, 2013 | Double Concerto in D Minor By Johann Sebastian... | 18.77 | http://ecx.images-amazon.com/images/I/41m6ygCq... | {'Musical Instruments': 94593} | [[Musical Instruments]] | {'also_viewed': ['B0058DK7RA'], 'buy_after_vie... | Composer: J.S. Bach.Peters Edition.For two vio... | |
24 | A1D1S69ZMKYCCV | 0634061801 | daniel | [0, 0] | great book... you should be aware of blues sca... | 5.0 | product review | 1289001600 | 11 6, 2010 | 50 Licks Blues Style - DVD | 12.49 | http://ecx.images-amazon.com/images/I/51zPYfTt... | {'Books': 108256} | [[Movies & TV, TV], [Musical Instruments, Inst... | {'also_bought': ['B001QIVESC', 'B005T5OBO2', '... | Hal Leonard | Unlock the secrets of blues guitar! Ace sessio... |
25 | A7R79JLQ5Q28M | 0634061801 | Eddie | [0, 0] | While I did not enjoy Mat's "Blues Rock guitar... | 4.0 | Good collection of blues licks | 1124928000 | 08 25, 2005 | 50 Licks Blues Style - DVD | 12.49 | http://ecx.images-amazon.com/images/I/51zPYfTt... | {'Books': 108256} | [[Movies & TV, TV], [Musical Instruments, Inst... | {'also_bought': ['B001QIVESC', 'B005T5OBO2', '... | Hal Leonard | Unlock the secrets of blues guitar! Ace sessio... |
26 | A3T7SXCKJUYXT3 | 0634061801 | K. D. Shultz | [4, 4] | This was a very pleasant suprize i really enjo... | 5.0 | Buy It ...and get out of a rut | 1190851200 | 09 27, 2007 | 50 Licks Blues Style - DVD | 12.49 | http://ecx.images-amazon.com/images/I/51zPYfTt... | {'Books': 108256} | [[Movies & TV, TV], [Musical Instruments, Inst... | {'also_bought': ['B001QIVESC', 'B005T5OBO2', '... | Hal Leonard | Unlock the secrets of blues guitar! Ace sessio... |
27 | A1JZM0YIYHL6JW | 0634061801 | Leon C. Rodriguez "bluehillcinema" | [3, 3] | I've been playing the blues for a long time so... | 4.0 | Well executed and professionally done | 1270684800 | 04 8, 2010 | 50 Licks Blues Style - DVD | 12.49 | http://ecx.images-amazon.com/images/I/51zPYfTt... | {'Books': 108256} | [[Movies & TV, TV], [Musical Instruments, Inst... | {'also_bought': ['B001QIVESC', 'B005T5OBO2', '... | Hal Leonard | Unlock the secrets of blues guitar! Ace sessio... |
df_his1 = df_musI_merged.filter(items=['asin','reviewTime'])
df_his1 = df_his1.groupby(['reviewTime'],as_index=False).count()
df_his1['Month'] = pd.DatetimeIndex(df_his1['reviewTime']).month
df_his1 = df_his1.filter(items=['Month','asin'])
df_his1
df_his2 = df_DigMus_merged.filter(items=['asin','reviewTime'])
df_his2 = df_his2.groupby(['reviewTime'],as_index=False).count()
df_his2['Month'] = pd.DatetimeIndex(df_his2['reviewTime']).month
df_his2 = df_his2.filter(items=['Month','asin'])
df_his2
Month | asin | |
---|---|---|
0 | 1 | 6 |
1 | 1 | 37 |
2 | 1 | 33 |
3 | 1 | 42 |
4 | 1 | 47 |
... | ... | ... |
5936 | 12 | 117 |
5937 | 12 | 101 |
5938 | 12 | 140 |
5939 | 12 | 1309 |
5940 | 12 | 597 |
5941 rows × 2 columns
df_his3 = df_Baby_merged.filter(items=['asin','reviewTime'])
df_his3 = df_his3.groupby(['reviewTime'],as_index=False).count()
df_his3['Month'] = pd.DatetimeIndex(df_his3['reviewTime']).month
df_his3 = df_his3.filter(items=['Month','asin'])
df_his3
Month | asin | |
---|---|---|
0 | 1 | 5 |
1 | 1 | 6 |
2 | 1 | 9 |
3 | 1 | 33 |
4 | 1 | 40 |
... | ... | ... |
4864 | 12 | 120 |
4865 | 12 | 191 |
4866 | 12 | 411 |
4867 | 12 | 646 |
4868 | 12 | 1307 |
4869 rows × 2 columns
df_his4 = df_Patio_merged.filter(items=['asin','reviewTime'])
df_his4 = df_his4.groupby(['reviewTime'],as_index=False).count()
df_his4['Month'] = pd.DatetimeIndex(df_his4['reviewTime']).month
df_his4 = df_his4.filter(items=['Month','asin'])
df_his4
Month | asin | |
---|---|---|
0 | 1 | 1 |
1 | 1 | 3 |
2 | 1 | 5 |
3 | 1 | 8 |
4 | 1 | 8 |
... | ... | ... |
4924 | 12 | 60 |
4925 | 12 | 125 |
4926 | 12 | 254 |
4927 | 12 | 669 |
4928 | 12 | 1116 |
4929 rows × 2 columns
df_his5 = df_PetSup_merged.filter(items=['asin','reviewTime'])
df_his5 = df_his5.groupby(['reviewTime'],as_index=False).count()
df_his5['Month'] = pd.DatetimeIndex(df_his5['reviewTime']).month
df_his5 = df_his5.filter(items=['Month','asin'])
df_his5
Month | asin | |
---|---|---|
0 | 1 | 1 |
1 | 1 | 1 |
2 | 1 | 3 |
3 | 1 | 4 |
4 | 1 | 13 |
... | ... | ... |
3895 | 12 | 125 |
3896 | 12 | 246 |
3897 | 12 | 420 |
3898 | 12 | 1119 |
3899 | 12 | 1891 |
3900 rows × 2 columns
# Plotting the Histogram of the 5 created datasets
x1 = df_his1['asin']
x2 = df_his2['asin']
x3 = df_his3['asin']
x4 = df_his4['asin']
x5 = df_his5['asin']
plt.figure(figsize=(8,10))
plt.title("Stacked Histogram of the 5 datasets with number of products every month")
plt.hist([x1,x2,x3,x4,x5], bins=[1,2,3,4,5,6,7,8,9,10,11,12], stacked=True, density=False,label=['Musical Instruments','Digital Music','Baby','Patio','Pet Supply'])
plt.ylabel('Number of Products each Month')
plt.xlabel('Months')
plt.legend()
plt.show()