# Import the packages we need for the analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_excel("transactions.xlsx", sheet_name = "Credit-Card-Transactions")
data
|  | City | Date | Card Type | Exp Type | Gender | Amount |
|---|---|---|---|---|---|---|
| 0 | Delhi, India | 2014-10-29 | Gold | Bills | F | 82475 |
| 1 | Greater Mumbai, India | 2014-08-22 | Platinum | Bills | F | 32555 |
| 2 | Bengaluru, India | 2014-08-27 | Silver | Bills | F | 101738 |
| 3 | Greater Mumbai, India | 2014-04-12 | Signature | Bills | F | 123424 |
| 4 | Bengaluru, India | 2015-05-05 | Gold | Bills | F | 171574 |
| ... | ... | ... | ... | ... | ... | ... |
| 26047 | Kolkata, India | 2014-06-22 | Silver | Travel | F | 128191 |
| 26048 | Pune, India | 2014-08-03 | Signature | Travel | M | 246316 |
| 26049 | Hyderabad, India | 2015-01-16 | Silver | Travel | M | 265019 |
| 26050 | Kanpur, India | 2014-09-14 | Silver | Travel | M | 88174 |
| 26051 | Hyderabad, India | 2013-10-19 | Signature | Bills | M | 184410 |
26052 rows × 6 columns
type(data)
pandas.core.frame.DataFrame
data.describe()
|  | Amount |
|---|---|
| count | 26052.000000 |
| mean | 156411.537425 |
| std | 103063.254287 |
| min | 1005.000000 |
| 25% | 77120.250000 |
| 50% | 153106.500000 |
| 75% | 228050.000000 |
| max | 998077.000000 |
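The date filters later in this notebook compare the Date column against plain strings, which only behaves chronologically if pandas parsed the column as datetime64 when reading the Excel file (it usually does). If it were read in as text instead, a minimal conversion sketch:
# Ensure Date is datetime64 so comparisons like data["Date"] >= '2013-1-1'
# are chronological rather than lexicographic string comparisons.
data['Date'] = pd.to_datetime(data['Date'])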
# Let's split the City column and drop the country suffix 'India', since every city is in India.
data['City_Name'] = data['City'].str.split(',', n=1, expand=True)[0]
data
|  | City | Date | Card Type | Exp Type | Gender | Amount | City_Name |
|---|---|---|---|---|---|---|---|
| 0 | Delhi, India | 2014-10-29 | Gold | Bills | F | 82475 | Delhi |
| 1 | Greater Mumbai, India | 2014-08-22 | Platinum | Bills | F | 32555 | Greater Mumbai |
| 2 | Bengaluru, India | 2014-08-27 | Silver | Bills | F | 101738 | Bengaluru |
| 3 | Greater Mumbai, India | 2014-04-12 | Signature | Bills | F | 123424 | Greater Mumbai |
| 4 | Bengaluru, India | 2015-05-05 | Gold | Bills | F | 171574 | Bengaluru |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 26047 | Kolkata, India | 2014-06-22 | Silver | Travel | F | 128191 | Kolkata |
| 26048 | Pune, India | 2014-08-03 | Signature | Travel | M | 246316 | Pune |
| 26049 | Hyderabad, India | 2015-01-16 | Silver | Travel | M | 265019 | Hyderabad |
| 26050 | Kanpur, India | 2014-09-14 | Silver | Travel | M | 88174 | Kanpur |
| 26051 | Hyderabad, India | 2013-10-19 | Signature | Bills | M | 184410 | Hyderabad |
26052 rows × 7 columns
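Since every City value ends with the fixed suffix ', India', the same column could also be derived without splitting at all; a one-line alternative, assuming pandas 1.4+ for str.removesuffix:
# Equivalent: strip the constant country suffix directly.
data['City_Name'] = data['City'].str.removesuffix(', India')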
# Drop the original City column and move City_Name to the first position.
data = data.drop(['City'], axis = 1)
new_order = [5,1,2,3,0,4]
data = data[data.columns[new_order]]
data
|  | City_Name | Card Type | Exp Type | Gender | Date | Amount |
|---|---|---|---|---|---|---|
| 0 | Delhi | Gold | Bills | F | 2014-10-29 | 82475 |
| 1 | Greater Mumbai | Platinum | Bills | F | 2014-08-22 | 32555 |
| 2 | Bengaluru | Silver | Bills | F | 2014-08-27 | 101738 |
| 3 | Greater Mumbai | Signature | Bills | F | 2014-04-12 | 123424 |
| 4 | Bengaluru | Gold | Bills | F | 2015-05-05 | 171574 |
| ... | ... | ... | ... | ... | ... | ... |
| 26047 | Kolkata | Silver | Travel | F | 2014-06-22 | 128191 |
| 26048 | Pune | Signature | Travel | M | 2014-08-03 | 246316 |
| 26049 | Hyderabad | Silver | Travel | M | 2015-01-16 | 265019 |
| 26050 | Kanpur | Silver | Travel | M | 2014-09-14 | 88174 |
| 26051 | Hyderabad | Signature | Bills | M | 2013-10-19 | 184410 |
26052 rows × 6 columns
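Positional indices such as new_order = [5,1,2,3,0,4] silently break if the column layout ever changes; selecting by name is safer. An equivalent, name-based sketch:
# Same reordering, expressed with column names instead of positions.
data = data[['City_Name', 'Card Type', 'Exp Type', 'Gender', 'Date', 'Amount']]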
# Select the columns needed for the city-level analysis: City_Name, Date, Amount.
select = [0,4,5]
newdata = data[data.columns[select]]
# Sorting the data by amount in descending order.
newdata = newdata.sort_values(by = ['Amount'], ascending = False)
# Specifying the start and end dates.
start_2013 = '2013-1-1'
end_2013 = '2013-12-31'
after_start = newdata["Date"] >= start_2013
before_end = newdata["Date"] <= end_2013
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2013.
filtered_dates_2013 = newdata.loc[between_dates]
# Aggregating total spend by city and displaying the top 5.
filtered_dates_2013_new = filtered_dates_2013.groupby(by=["City_Name"])[['Amount']].sum()
filtered_dates_2013_new.sort_values(by = ['Amount'], ascending = False).head(5).T
| City_Name | Bengaluru | Greater Mumbai | Ahmedabad | Delhi | Kanpur |
|---|---|---|---|---|---|
| Amount | 84999287 | 84735652 | 83291837 | 82126961 | 19640338 |
# Specifying the start and end dates.
start_2014 = '2014-1-1'
end_2014 = '2014-12-31'
after_start = newdata["Date"] >= start_2014
before_end = newdata["Date"] <= end_2014
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2014.
filtered_dates_2014 = newdata.loc[between_dates]
# Aggregating total spend by city and displaying the top 5.
filtered_dates_2014 = filtered_dates_2014.groupby(by=["City_Name"])[['Amount']].sum()
filtered_dates_2014.sort_values(by = ['Amount'], ascending = False).head(5).T
| City_Name | Bengaluru | Greater Mumbai | Ahmedabad | Delhi | Hyderabad |
|---|---|---|---|---|---|
| Amount | 351482095 | 349709415 | 348289187 | 329591186 | 71066005 |
# Specifying the start and end dates.
start_2015 = '2015-1-1'
end_2015 = '2015-12-31'
after_start = newdata["Date"] >= start_2015
before_end = newdata["Date"] <= end_2015
between_dates = after_start & before_end
# Filtering the data to only show the rows in 2015.
filtered_dates_2015 = newdata.loc[between_dates]
# Aggregating total spend by city and displaying the top 5.
filtered_dates_2015 = filtered_dates_2015.groupby(by=["City_Name"])[['Amount']].sum()
filtered_dates_2015.sort_values(by = ['Amount'], ascending = False).head(5).T
| City_Name | Delhi | Greater Mumbai | Ahmedabad | Bengaluru | Kolkata |
|---|---|---|---|---|---|
| Amount | 145211065 | 142306409 | 136213286 | 135845357 | 30918731 |
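The three yearly blocks above repeat the same filter, group, and sort pattern. The whole comparison can be generated in one loop with the dt.year accessor; a sketch, assuming newdata as built above:
# Top-5 cities by total spend, computed once per year.
for year in (2013, 2014, 2015):
    yearly = newdata[newdata['Date'].dt.year == year]
    top5 = (yearly.groupby('City_Name')[['Amount']].sum()
                  .sort_values(by=['Amount'], ascending=False)
                  .head(5))
    print(year)
    print(top5.T)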
# Let's filter the dataset by year. First, pick the relevant columns: Gender, Date, Amount.
select = [3,4,5]
newdata1 = data[data.columns[select]]
# Specifying the start and end dates.
start_2013 = '2013-1-1'
end_2013 = '2013-12-31'
after_start = newdata1["Date"] >= start_2013
before_end = newdata1["Date"] <= end_2013
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2013.
filtered_dates_2013 = newdata1.loc[between_dates]
filtered_dates_2013
|  | Gender | Date | Amount |
|---|---|---|---|
| 19 | F | 2013-11-09 | 735566 |
| 23 | F | 2013-10-11 | 668568 |
| 32 | F | 2013-11-24 | 227702 |
| 40 | F | 2013-10-27 | 678842 |
| 47 | F | 2013-11-04 | 313626 |
| ... | ... | ... | ... |
| 26011 | F | 2013-11-13 | 52609 |
| 26028 | F | 2013-11-20 | 263420 |
| 26040 | F | 2013-10-26 | 163352 |
| 26046 | M | 2013-10-09 | 198903 |
| 26051 | M | 2013-10-19 | 184410 |
2548 rows × 3 columns
A closer look at the dataset shows that there is no data for the first three quarters of 2013 or the last two quarters of 2015.
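This can be verified directly before building the quarterly tables; a quick sanity check:
# The earliest and latest transaction dates bound the usable quarters;
# given the NaN quarters below, these should fall in late 2013 and mid 2015.
print(data['Date'].min(), data['Date'].max())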
# Specifying the start and end dates of q1
q1start_2013 = '2013-1-1'
q1end_2013 = '2013-3-31'
# Specifying the start and end dates of q2
q2start_2013 = '2013-4-1'
q2end_2013 = '2013-6-30'
# Specifying the start and end dates of q3
q3start_2013 = '2013-7-1'
q3end_2013 = '2013-9-30'
# Specifying the start and end dates of q4
q4start_2013 = '2013-10-1'
q4end_2013 = '2013-12-31'
q1after_start = newdata1["Date"] >= q1start_2013
q1before_end = newdata1["Date"] <= q1end_2013
q2after_start = newdata1["Date"] >= q2start_2013
q2before_end = newdata1["Date"] <= q2end_2013
q3after_start = newdata1["Date"] >= q3start_2013
q3before_end = newdata1["Date"] <= q3end_2013
q4after_start = newdata1["Date"] >= q4start_2013
q4before_end = newdata1["Date"] <= q4end_2013
q1between_dates = q1after_start & q1before_end
q2between_dates = q2after_start & q2before_end
q3between_dates = q3after_start & q3before_end
q4between_dates = q4after_start & q4before_end
#Filtering the data to only show the rows in each quarter.
filtered_dates_2013_q1 = newdata1.loc[q1between_dates]
filtered_dates_2013_q2 = newdata1.loc[q2between_dates]
filtered_dates_2013_q3 = newdata1.loc[q3between_dates]
filtered_dates_2013_q4 = newdata1.loc[q4between_dates]
q1 = filtered_dates_2013_q1.groupby(by=['Gender'])[['Amount']].sum()
q2 = filtered_dates_2013_q2.groupby(by=['Gender'])[['Amount']].sum()
q3 = filtered_dates_2013_q3.groupby(by=['Gender'])[['Amount']].sum()
q4 = filtered_dates_2013_q4.groupby(by=['Gender'])[['Amount']].sum()
frames = [q1,q2,q3,q4]
result = pd.concat(frames, axis = 1)
result.columns = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
result
| Gender | Quarter1 | Quarter2 | Quarter3 | Quarter4 |
|---|---|---|---|---|
| F | NaN | NaN | NaN | 333421908 |
| M | NaN | NaN | NaN | 280221976 |
# Specifying the start and end dates of q1
q1start_2014 = '2014-1-1'
q1end_2014 = '2014-3-31'
# Specifying the start and end dates of q2
q2start_2014 = '2014-4-1'
q2end_2014 = '2014-6-30'
# Specifying the start and end dates of q3
q3start_2014 = '2014-7-1'
q3end_2014 = '2014-9-30'
# Specifying the start and end dates of q4
q4start_2014 = '2014-10-1'
q4end_2014 = '2014-12-31'
q1after_start = newdata1["Date"] >= q1start_2014
q1before_end = newdata1["Date"] <= q1end_2014
q2after_start = newdata1["Date"] >= q2start_2014
q2before_end = newdata1["Date"] <= q2end_2014
q3after_start = newdata1["Date"] >= q3start_2014
q3before_end = newdata1["Date"] <= q3end_2014
q4after_start = newdata1["Date"] >= q4start_2014
q4before_end = newdata1["Date"] <= q4end_2014
q1between_dates = q1after_start & q1before_end
q2between_dates = q2after_start & q2before_end
q3between_dates = q3after_start & q3before_end
q4between_dates = q4after_start & q4before_end
#Filtering the data to only show the rows in each quarter.
filtered_dates_2014_q1 = newdata1.loc[q1between_dates]
filtered_dates_2014_q2 = newdata1.loc[q2between_dates]
filtered_dates_2014_q3 = newdata1.loc[q3between_dates]
filtered_dates_2014_q4 = newdata1.loc[q4between_dates]
q1 = filtered_dates_2014_q1.groupby(by=['Gender'])[['Amount']].sum()
q2 = filtered_dates_2014_q2.groupby(by=['Gender'])[['Amount']].sum()
q3 = filtered_dates_2014_q3.groupby(by=['Gender'])[['Amount']].sum()
q4 = filtered_dates_2014_q4.groupby(by=['Gender'])[['Amount']].sum()
frames = [q1,q2,q3,q4]
result = pd.concat(frames, axis = 1)
result.columns = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
result
| Gender | Quarter1 | Quarter2 | Quarter3 | Quarter4 |
|---|---|---|---|---|
| F | 318768288 | 331421060 | 335382914 | 334689993 |
| M | 281336181 | 269947023 | 290613061 | 292555513 |
# Specifying the start and end dates of q1
q1start_2015 = '2015-1-1'
q1end_2015 = '2015-3-31'
# Specifying the start and end dates of q2
q2start_2015 = '2015-4-1'
q2end_2015 = '2015-6-30'
# Specifying the start and end dates of q3
q3start_2015 = '2015-7-1'
q3end_2015 = '2015-9-30'
# Specifying the start and end dates of q4
q4start_2015 = '2015-10-1'
q4end_2015 = '2015-12-31'
q1after_start = newdata1["Date"] >= q1start_2015
q1before_end = newdata1["Date"] <= q1end_2015
q2after_start = newdata1["Date"] >= q2start_2015
q2before_end = newdata1["Date"] <= q2end_2015
q3after_start = newdata1["Date"] >= q3start_2015
q3before_end = newdata1["Date"] <= q3end_2015
q4after_start = newdata1["Date"] >= q4start_2015
q4before_end = newdata1["Date"] <= q4end_2015
q1between_dates = q1after_start & q1before_end
q2between_dates = q2after_start & q2before_end
q3between_dates = q3after_start & q3before_end
q4between_dates = q4after_start & q4before_end
#Filtering the data to only show the rows in each quarter.
filtered_dates_2015_q1 = newdata1.loc[q1between_dates]
filtered_dates_2015_q2 = newdata1.loc[q2between_dates]
filtered_dates_2015_q3 = newdata1.loc[q3between_dates]
filtered_dates_2015_q4 = newdata1.loc[q4between_dates]
q1 = filtered_dates_2015_q1.groupby(by=['Gender'])[['Amount']].sum()
q2 = filtered_dates_2015_q2.groupby(by=['Gender'])[['Amount']].sum()
q3 = filtered_dates_2015_q3.groupby(by=['Gender'])[['Amount']].sum()
q4 = filtered_dates_2015_q4.groupby(by=['Gender'])[['Amount']].sum()
frames = [q1,q2,q3,q4]
result = pd.concat(frames, axis = 1)
result.columns = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
result
| Gender | Quarter1 | Quarter2 | Quarter3 | Quarter4 |
|---|---|---|---|---|
| F | 344993188 | 206633679 | NaN | NaN |
| M | 279678969 | 175169620 | NaN | NaN |
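All three quarterly blocks above collapse into a single pivot by grouping on the quarterly period of each date; a sketch, assuming newdata1 as built above:
# Gender-by-quarter totals in one pass, using quarterly periods.
quarterly = (newdata1
             .assign(Quarter=newdata1['Date'].dt.to_period('Q'))
             .groupby(['Gender', 'Quarter'])[['Amount']].sum()
             .unstack('Quarter'))
print(quarterly)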
# Select the columns needed for the expense-type analysis: Exp Type, Date, Amount.
select = [2,4,5]
newdata2 = data[data.columns[select]]
# Sorting the data by amount in ascending order.
newdata2 = newdata2.sort_values(by = ['Amount'], ascending = True)
# Specifying the start and end dates.
start_2013 = '2013-1-1'
end_2013 = '2013-12-31'
after_start = newdata2["Date"] >= start_2013
before_end = newdata2["Date"] <= end_2013
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2013.
filtered_dates_2013 = newdata2.loc[between_dates]
# Aggregating spend by expense type and displaying the three lowest-spend categories.
filtered_dates_2013.groupby(by = ['Exp Type'])[['Amount']].sum().sort_values(by = ['Amount'], ascending = True).head(3).T
| Exp Type | Travel | Grocery | Entertainment |
|---|---|---|---|
| Amount | 15353606 | 111806890 | 113094669 |
# Specifying the start and end dates.
start_2014 = '2014-1-1'
end_2014 = '2014-12-31'
after_start = newdata2["Date"] >= start_2014
before_end = newdata2["Date"] <= end_2014
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2014.
filtered_dates_2014 = newdata2.loc[between_dates]
# Aggregating spend by expense type and displaying the three lowest-spend categories.
filtered_dates_2014.groupby(by = ['Exp Type'])[['Amount']].sum().sort_values(by = ['Amount'], ascending = True).head(3).T
| Exp Type | Travel | Grocery | Entertainment |
|---|---|---|---|
| Amount | 65532848 | 428628961 | 443730570 |
# Specifying the start and end dates.
start_2015 = '2015-1-1'
end_2015 = '2015-12-31'
after_start = newdata2["Date"] >= start_2015
before_end = newdata2["Date"] <= end_2015
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2015.
filtered_dates_2015 = newdata2.loc[between_dates]
# Aggregating spend by expense type and displaying the three lowest-spend categories.
filtered_dates_2015.groupby(by = ['Exp Type'])[['Amount']].sum().sort_values(by = ['Amount'], ascending = True).head(3).T
| Exp Type | Travel | Entertainment | Grocery |
|---|---|---|---|
| Amount | 28369157 | 169612297 | 177772072 |
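As with the city tables, the three expense-type blocks follow one pattern and can be driven by a single loop; a sketch using newdata2 as built above:
# Three lowest-spend expense types per year.
for year in (2013, 2014, 2015):
    yearly = newdata2[newdata2['Date'].dt.year == year]
    bottom3 = (yearly.groupby('Exp Type')[['Amount']].sum()
                     .sort_values(by=['Amount'], ascending=True)
                     .head(3))
    print(year)
    print(bottom3.T)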