# Import the packages we need for the analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_excel("transactions.xlsx", sheet_name = "Credit-Card-Transactions")
data
|  | City | Date | Card Type | Exp Type | Gender | Amount |
|---|---|---|---|---|---|---|
| 0 | Delhi, India | 2014-10-29 | Gold | Bills | F | 82475 |
| 1 | Greater Mumbai, India | 2014-08-22 | Platinum | Bills | F | 32555 |
| 2 | Bengaluru, India | 2014-08-27 | Silver | Bills | F | 101738 |
| 3 | Greater Mumbai, India | 2014-04-12 | Signature | Bills | F | 123424 |
| 4 | Bengaluru, India | 2015-05-05 | Gold | Bills | F | 171574 |
| ... | ... | ... | ... | ... | ... | ... |
| 26047 | Kolkata, India | 2014-06-22 | Silver | Travel | F | 128191 |
| 26048 | Pune, India | 2014-08-03 | Signature | Travel | M | 246316 |
| 26049 | Hyderabad, India | 2015-01-16 | Silver | Travel | M | 265019 |
| 26050 | Kanpur, India | 2014-09-14 | Silver | Travel | M | 88174 |
| 26051 | Hyderabad, India | 2013-10-19 | Signature | Bills | M | 184410 |
26052 rows × 6 columns
type(data)
pandas.core.frame.DataFrame
data.describe()
|  | Amount |
|---|---|
| count | 26052.000000 |
| mean | 156411.537425 |
| std | 103063.254287 |
| min | 1005.000000 |
| 25% | 77120.250000 |
| 50% | 153106.500000 |
| 75% | 228050.000000 |
| max | 998077.000000 |
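The date filters later in this notebook compare the Date column against plain strings, which only behaves chronologically if pandas parsed the column as datetime64 when reading the Excel file (it usually does). If it were read in as text instead, a minimal conversion sketch:
# Ensure Date is datetime64 so comparisons like data["Date"] >= '2013-1-1'
# are chronological rather than lexicographic string comparisons.
data['Date'] = pd.to_datetime(data['Date'])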
# Let's split the City column and drop the country suffix 'India', since every city is in India.
data['City_Name'] = data['City'].str.split(',', n=1, expand=True)[0]
data
|  | City | Date | Card Type | Exp Type | Gender | Amount | City_Name |
|---|---|---|---|---|---|---|---|
| 0 | Delhi, India | 2014-10-29 | Gold | Bills | F | 82475 | Delhi |
| 1 | Greater Mumbai, India | 2014-08-22 | Platinum | Bills | F | 32555 | Greater Mumbai |
| 2 | Bengaluru, India | 2014-08-27 | Silver | Bills | F | 101738 | Bengaluru |
| 3 | Greater Mumbai, India | 2014-04-12 | Signature | Bills | F | 123424 | Greater Mumbai |
| 4 | Bengaluru, India | 2015-05-05 | Gold | Bills | F | 171574 | Bengaluru |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 26047 | Kolkata, India | 2014-06-22 | Silver | Travel | F | 128191 | Kolkata |
| 26048 | Pune, India | 2014-08-03 | Signature | Travel | M | 246316 | Pune |
| 26049 | Hyderabad, India | 2015-01-16 | Silver | Travel | M | 265019 | Hyderabad |
| 26050 | Kanpur, India | 2014-09-14 | Silver | Travel | M | 88174 | Kanpur |
| 26051 | Hyderabad, India | 2013-10-19 | Signature | Bills | M | 184410 | Hyderabad |
26052 rows × 7 columns
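Since every City value ends with the fixed suffix ', India', the same column could also be derived without splitting at all; a one-line alternative, assuming pandas 1.4+ for str.removesuffix:
# Equivalent: strip the constant country suffix directly.
data['City_Name'] = data['City'].str.removesuffix(', India')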
# Drop the original City column and move City_Name to the first position.
data = data.drop(['City'], axis = 1)
new_order = [5,1,2,3,0,4]
data = data[data.columns[new_order]]
data
|  | City_Name | Card Type | Exp Type | Gender | Date | Amount |
|---|---|---|---|---|---|---|
| 0 | Delhi | Gold | Bills | F | 2014-10-29 | 82475 |
| 1 | Greater Mumbai | Platinum | Bills | F | 2014-08-22 | 32555 |
| 2 | Bengaluru | Silver | Bills | F | 2014-08-27 | 101738 |
| 3 | Greater Mumbai | Signature | Bills | F | 2014-04-12 | 123424 |
| 4 | Bengaluru | Gold | Bills | F | 2015-05-05 | 171574 |
| ... | ... | ... | ... | ... | ... | ... |
| 26047 | Kolkata | Silver | Travel | F | 2014-06-22 | 128191 |
| 26048 | Pune | Signature | Travel | M | 2014-08-03 | 246316 |
| 26049 | Hyderabad | Silver | Travel | M | 2015-01-16 | 265019 |
| 26050 | Kanpur | Silver | Travel | M | 2014-09-14 | 88174 |
| 26051 | Hyderabad | Signature | Bills | M | 2013-10-19 | 184410 |
26052 rows × 6 columns
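Positional indices such as new_order = [5,1,2,3,0,4] silently break if the column layout ever changes; selecting by name is safer. An equivalent, name-based sketch:
# Same reordering, expressed with column names instead of positions.
data = data[['City_Name', 'Card Type', 'Exp Type', 'Gender', 'Date', 'Amount']]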
# Select the columns needed for the city-level analysis: City_Name, Date, Amount.
select = [0,4,5]
newdata = data[data.columns[select]]
# Sorting the data by amount in descending order.
newdata = newdata.sort_values(by = ['Amount'], ascending = False)
# Specifying the start and end dates.
start_2013 = '2013-1-1'
end_2013 = '2013-12-31'
after_start = newdata["Date"] >= start_2013
before_end = newdata["Date"] <= end_2013
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2013.
filtered_dates_2013 = newdata.loc[between_dates]
# Aggregating total spend by city and displaying the top 5.
filtered_dates_2013_new = filtered_dates_2013.groupby(by=["City_Name"])[['Amount']].sum()
filtered_dates_2013_new.sort_values(by = ['Amount'], ascending = False).head(5).T
| City_Name | Bengaluru | Greater Mumbai | Ahmedabad | Delhi | Kanpur |
|---|---|---|---|---|---|
| Amount | 84999287 | 84735652 | 83291837 | 82126961 | 19640338 |
# Specifying the start and end dates.
start_2014 = '2014-1-1'
end_2014 = '2014-12-31'
after_start = newdata["Date"] >= start_2014
before_end = newdata["Date"] <= end_2014
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2014.
filtered_dates_2014 = newdata.loc[between_dates]
# Aggregating total spend by city and displaying the top 5.
filtered_dates_2014 = filtered_dates_2014.groupby(by=["City_Name"])[['Amount']].sum()
filtered_dates_2014.sort_values(by = ['Amount'], ascending = False).head(5).T
| City_Name | Bengaluru | Greater Mumbai | Ahmedabad | Delhi | Hyderabad |
|---|---|---|---|---|---|
| Amount | 351482095 | 349709415 | 348289187 | 329591186 | 71066005 |
# Specifying the start and end dates.
start_2015 = '2015-1-1'
end_2015 = '2015-12-31'
after_start = newdata["Date"] >= start_2015
before_end = newdata["Date"] <= end_2015
between_dates = after_start & before_end
# Filtering the data to only show the rows in 2015.
filtered_dates_2015 = newdata.loc[between_dates]
# Aggregating total spend by city and displaying the top 5.
filtered_dates_2015 = filtered_dates_2015.groupby(by=["City_Name"])[['Amount']].sum()
filtered_dates_2015.sort_values(by = ['Amount'], ascending = False).head(5).T
| City_Name | Delhi | Greater Mumbai | Ahmedabad | Bengaluru | Kolkata |
|---|---|---|---|---|---|
| Amount | 145211065 | 142306409 | 136213286 | 135845357 | 30918731 |
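The three yearly blocks above repeat the same filter, group, and sort pattern. The whole comparison can be generated in one loop with the dt.year accessor; a sketch, assuming newdata as built above:
# Top-5 cities by total spend, computed once per year.
for year in (2013, 2014, 2015):
    yearly = newdata[newdata['Date'].dt.year == year]
    top5 = (yearly.groupby('City_Name')[['Amount']].sum()
                  .sort_values(by=['Amount'], ascending=False)
                  .head(5))
    print(year)
    print(top5.T)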
# Let's filter the dataset by year. First, pick the relevant columns: Gender, Date, Amount.
select = [3,4,5]
newdata1 = data[data.columns[select]]
# Specifying the start and end dates.
start_2013 = '2013-1-1'
end_2013 = '2013-12-31'
after_start = newdata1["Date"] >= start_2013
before_end = newdata1["Date"] <= end_2013
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2013.
filtered_dates_2013 = newdata1.loc[between_dates]
filtered_dates_2013
|  | Gender | Date | Amount |
|---|---|---|---|
| 19 | F | 2013-11-09 | 735566 |
| 23 | F | 2013-10-11 | 668568 |
| 32 | F | 2013-11-24 | 227702 |
| 40 | F | 2013-10-27 | 678842 |
| 47 | F | 2013-11-04 | 313626 |
| ... | ... | ... | ... |
| 26011 | F | 2013-11-13 | 52609 |
| 26028 | F | 2013-11-20 | 263420 |
| 26040 | F | 2013-10-26 | 163352 |
| 26046 | M | 2013-10-09 | 198903 |
| 26051 | M | 2013-10-19 | 184410 |
2548 rows × 3 columns
A closer look at the dataset shows that there is no data for the first three quarters of 2013 or the last two quarters of 2015.
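This can be verified directly before building the quarterly tables; a quick sanity check:
# The earliest and latest transaction dates bound the usable quarters;
# given the NaN quarters below, these should fall in late 2013 and mid 2015.
print(data['Date'].min(), data['Date'].max())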
# Specifying the start and end dates of q1
q1start_2013 = '2013-1-1'
q1end_2013 = '2013-3-31'
# Specifying the start and end dates of q2
q2start_2013 = '2013-4-1'
q2end_2013 = '2013-6-30'
# Specifying the start and end dates of q3
q3start_2013 = '2013-7-1'
q3end_2013 = '2013-9-30'
# Specifying the start and end dates of q4
q4start_2013 = '2013-10-1'
q4end_2013 = '2013-12-31'
q1after_start = newdata1["Date"] >= q1start_2013
q1before_end = newdata1["Date"] <= q1end_2013
q2after_start = newdata1["Date"] >= q2start_2013
q2before_end = newdata1["Date"] <= q2end_2013
q3after_start = newdata1["Date"] >= q3start_2013
q3before_end = newdata1["Date"] <= q3end_2013
q4after_start = newdata1["Date"] >= q4start_2013
q4before_end = newdata1["Date"] <= q4end_2013
q1between_dates = q1after_start & q1before_end
q2between_dates = q2after_start & q2before_end
q3between_dates = q3after_start & q3before_end
q4between_dates = q4after_start & q4before_end
#Filtering the data to only show the rows in each quarter.
filtered_dates_2013_q1 = newdata1.loc[q1between_dates]
filtered_dates_2013_q2 = newdata1.loc[q2between_dates]
filtered_dates_2013_q3 = newdata1.loc[q3between_dates]
filtered_dates_2013_q4 = newdata1.loc[q4between_dates]
q1 = filtered_dates_2013_q1.groupby(by=['Gender'])[['Amount']].sum()
q2 = filtered_dates_2013_q2.groupby(by=['Gender'])[['Amount']].sum()
q3 = filtered_dates_2013_q3.groupby(by=['Gender'])[['Amount']].sum()
q4 = filtered_dates_2013_q4.groupby(by=['Gender'])[['Amount']].sum()
frames = [q1,q2,q3,q4]
result = pd.concat(frames, axis = 1)
result.columns = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
result
| Gender | Quarter1 | Quarter2 | Quarter3 | Quarter4 |
|---|---|---|---|---|
| F | NaN | NaN | NaN | 333421908 |
| M | NaN | NaN | NaN | 280221976 |
# Specifying the start and end dates of q1
q1start_2014 = '2014-1-1'
q1end_2014 = '2014-3-31'
# Specifying the start and end dates of q2
q2start_2014 = '2014-4-1'
q2end_2014 = '2014-6-30'
# Specifying the start and end dates of q3
q3start_2014 = '2014-7-1'
q3end_2014 = '2014-9-30'
# Specifying the start and end dates of q4
q4start_2014 = '2014-10-1'
q4end_2014 = '2014-12-31'
q1after_start = newdata1["Date"] >= q1start_2014
q1before_end = newdata1["Date"] <= q1end_2014
q2after_start = newdata1["Date"] >= q2start_2014
q2before_end = newdata1["Date"] <= q2end_2014
q3after_start = newdata1["Date"] >= q3start_2014
q3before_end = newdata1["Date"] <= q3end_2014
q4after_start = newdata1["Date"] >= q4start_2014
q4before_end = newdata1["Date"] <= q4end_2014
q1between_dates = q1after_start & q1before_end
q2between_dates = q2after_start & q2before_end
q3between_dates = q3after_start & q3before_end
q4between_dates = q4after_start & q4before_end
#Filtering the data to only show the rows in each quarter.
filtered_dates_2014_q1 = newdata1.loc[q1between_dates]
filtered_dates_2014_q2 = newdata1.loc[q2between_dates]
filtered_dates_2014_q3 = newdata1.loc[q3between_dates]
filtered_dates_2014_q4 = newdata1.loc[q4between_dates]
q1 = filtered_dates_2014_q1.groupby(by=['Gender'])[['Amount']].sum()
q2 = filtered_dates_2014_q2.groupby(by=['Gender'])[['Amount']].sum()
q3 = filtered_dates_2014_q3.groupby(by=['Gender'])[['Amount']].sum()
q4 = filtered_dates_2014_q4.groupby(by=['Gender'])[['Amount']].sum()
frames = [q1,q2,q3,q4]
result = pd.concat(frames, axis = 1)
result.columns = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
result
| Gender | Quarter1 | Quarter2 | Quarter3 | Quarter4 |
|---|---|---|---|---|
| F | 318768288 | 331421060 | 335382914 | 334689993 |
| M | 281336181 | 269947023 | 290613061 | 292555513 |
# Specifying the start and end dates of q1
q1start_2015 = '2015-1-1'
q1end_2015 = '2015-3-31'
# Specifying the start and end dates of q2
q2start_2015 = '2015-4-1'
q2end_2015 = '2015-6-30'
# Specifying the start and end dates of q3
q3start_2015 = '2015-7-1'
q3end_2015 = '2015-9-30'
# Specifying the start and end dates of q4
q4start_2015 = '2015-10-1'
q4end_2015 = '2015-12-31'
q1after_start = newdata1["Date"] >= q1start_2015
q1before_end = newdata1["Date"] <= q1end_2015
q2after_start = newdata1["Date"] >= q2start_2015
q2before_end = newdata1["Date"] <= q2end_2015
q3after_start = newdata1["Date"] >= q3start_2015
q3before_end = newdata1["Date"] <= q3end_2015
q4after_start = newdata1["Date"] >= q4start_2015
q4before_end = newdata1["Date"] <= q4end_2015
q1between_dates = q1after_start & q1before_end
q2between_dates = q2after_start & q2before_end
q3between_dates = q3after_start & q3before_end
q4between_dates = q4after_start & q4before_end
#Filtering the data to only show the rows in each quarter.
filtered_dates_2015_q1 = newdata1.loc[q1between_dates]
filtered_dates_2015_q2 = newdata1.loc[q2between_dates]
filtered_dates_2015_q3 = newdata1.loc[q3between_dates]
filtered_dates_2015_q4 = newdata1.loc[q4between_dates]
q1 = filtered_dates_2015_q1.groupby(by=['Gender'])[['Amount']].sum()
q2 = filtered_dates_2015_q2.groupby(by=['Gender'])[['Amount']].sum()
q3 = filtered_dates_2015_q3.groupby(by=['Gender'])[['Amount']].sum()
q4 = filtered_dates_2015_q4.groupby(by=['Gender'])[['Amount']].sum()
frames = [q1,q2,q3,q4]
result = pd.concat(frames, axis = 1)
result.columns = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
result
| Gender | Quarter1 | Quarter2 | Quarter3 | Quarter4 |
|---|---|---|---|---|
| F | 344993188 | 206633679 | NaN | NaN |
| M | 279678969 | 175169620 | NaN | NaN |
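All three quarterly blocks above collapse into a single pivot by grouping on the quarterly period of each date; a sketch, assuming newdata1 as built above:
# Gender-by-quarter totals in one pass, using quarterly periods.
quarterly = (newdata1
             .assign(Quarter=newdata1['Date'].dt.to_period('Q'))
             .groupby(['Gender', 'Quarter'])[['Amount']].sum()
             .unstack('Quarter'))
print(quarterly)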
# Select the columns needed for the expense-type analysis: Exp Type, Date, Amount.
select = [2,4,5]
newdata2 = data[data.columns[select]]
# Sorting the data by amount in ascending order.
newdata2 = newdata2.sort_values(by = ['Amount'], ascending = True)
# Specifying the start and end dates.
start_2013 = '2013-1-1'
end_2013 = '2013-12-31'
after_start = newdata2["Date"] >= start_2013
before_end = newdata2["Date"] <= end_2013
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2013.
filtered_dates_2013 = newdata2.loc[between_dates]
# Aggregating spend by expense type and displaying the three lowest-spend categories.
filtered_dates_2013.groupby(by = ['Exp Type'])[['Amount']].sum().sort_values(by = ['Amount'], ascending = True).head(3).T
| Exp Type | Travel | Grocery | Entertainment |
|---|---|---|---|
| Amount | 15353606 | 111806890 | 113094669 |
# Specifying the start and end dates.
start_2014 = '2014-1-1'
end_2014 = '2014-12-31'
after_start = newdata2["Date"] >= start_2014
before_end = newdata2["Date"] <= end_2014
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2014.
filtered_dates_2014 = newdata2.loc[between_dates]
# Aggregating spend by expense type and displaying the three lowest-spend categories.
filtered_dates_2014.groupby(by = ['Exp Type'])[['Amount']].sum().sort_values(by = ['Amount'], ascending = True).head(3).T
| Exp Type | Travel | Grocery | Entertainment |
|---|---|---|---|
| Amount | 65532848 | 428628961 | 443730570 |
# Specifying the start and end dates.
start_2015 = '2015-1-1'
end_2015 = '2015-12-31'
after_start = newdata2["Date"] >= start_2015
before_end = newdata2["Date"] <= end_2015
between_dates = after_start & before_end
#Filtering the data to only show the rows in 2015.
filtered_dates_2015 = newdata2.loc[between_dates]
# Aggregating spend by expense type and displaying the three lowest-spend categories.
filtered_dates_2015.groupby(by = ['Exp Type'])[['Amount']].sum().sort_values(by = ['Amount'], ascending = True).head(3).T
| Exp Type | Travel | Entertainment | Grocery |
|---|---|---|---|
| Amount | 28369157 | 169612297 | 177772072 |
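As with the city tables, the three expense-type blocks follow one pattern and can be driven by a single loop; a sketch using newdata2 as built above:
# Three lowest-spend expense types per year.
for year in (2013, 2014, 2015):
    yearly = newdata2[newdata2['Date'].dt.year == year]
    bottom3 = (yearly.groupby('Exp Type')[['Amount']].sum()
                     .sort_values(by=['Amount'], ascending=True)
                     .head(3))
    print(year)
    print(bottom3.T)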