#Domain – HR
#focus – Insights from data
import pandas as pd
# Set the option on how to display float
# To See the impact of this command comment it out and check outputs
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Read Salaries.csv as a dataframe called salary
salary = pd.read_csv('Salaries.csv')
# Check the basic data
print(salary.head())
# Get more info on data & Verify that 148,648 records are there
print (salary.info())
# Get Some statistical summary of data
print (salary.describe())
# Check the total salary cost per year and see how it has increased over years
sum_year = salary.groupby('Year').sum()['TotalPayBenefits']
print ( sum_year)
# Question 1 : Compute how much total salary cost has increased from 2011 to 2014.
Code:
mean_year = salary.groupby('Year').mean()['TotalPayBenefits']
print ( mean_year)
# Question 2 : Which Job Title in Year 2014 has highest mean salary
grouped_df_JobTitle = salary.query("Year == 2014").groupby(["JobTitle"]).agg("mean").filter(["JobTitle", "TotalPay"])
max_min_salary = grouped_df_JobTitle[grouped_df_JobTitle["TotalPay"] == grouped_df_JobTitle["TotalPay"].max()]
print("highest mean Salary job title is " + str(max_min_salary))
# Question 3 : How much money could have been saved in Year 2014 by stopping OverTimePay
over_time_pay_year = salary.groupby(["Year"]).sum()[["OvertimePay"]].query("Year == 2014")
print("Money could have been saved in Year 2014 by stopping OverTimePay :"+ str(float(over_time_pay_year["OvertimePay"])))
Question 4: Which are the top 5 common job in Year 2014 and how much do they cost SFO
top_job_title = salary[salary['Year'] == 2014]['JobTitle'].value_counts().head(5)
print (top_job_title)
# Uncomment this and check what it prints
#print (type(top_job_title))
# Calculate the Cost
sum_cost = 0
for index,value in top_job_title.iteritems():
print(index,value)
sum_cost += sum(salary[ (salary['Year']== 2014) & (salary['JobTitle'] == index)]['TotalPayBenefits'])
print (" Total Cost of Top 5 Jobs in Year 2014 ", sum_cost)
# Question 5 : Who was the top earning employee across all the years?
grouped_df_5_top_earners = df.groupby(["Year", "TotalPay"]).agg("max").filter(["Year", "TotalPay", "EmployeeName"])
print(grouped_df_5_top_earners)