The salary of data science

2023-05-25
data visualization
  • data handling with pandas
  • data cleaning
  • data visualization
  • data analysis (scikit-learn)

This is a concrete example of data visualization built around four questions.

The data comes from Kaggle (the ds_salaries dataset).

    1. What is the most common experience level in data science, and what is the average salary for each level?
    2. How does salary differ across employment types and remote ratios?
    3. How does salary differ across company sizes and locations?
    4. How does salary differ across company sizes?
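Questions like these reduce to pandas `value_counts` and `groupby` calls. A minimal sketch for the first question, using a tiny hypothetical sample (only the column names mirror the Kaggle dataset):

```python
import pandas as pd

# hypothetical mini-sample mirroring two of the Kaggle columns
sample = pd.DataFrame({
    "experience_level": ["SE", "SE", "EN", "MI"],
    "salary_in_usd": [150000, 170000, 60000, 100000],
})

# most common level: count occurrences, take the label with the highest count
most_common = sample["experience_level"].value_counts().idxmax()
# average salary per level: group by level, take the mean
avg_salary = sample.groupby("experience_level")["salary_in_usd"].mean()

print(most_common)       # SE
print(avg_salary["SE"])  # 160000.0
```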

1. Basic data preparation

import numpy as np
import pandas as pd
import pycountry
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

df = pd.read_csv('ds_salaries.csv')
# print(df.head(5))
# print(df.info())


"""deal with the dataset"""
# 1. use keyword lists to classify the job titles into broader categories
def classify_job(job_title):
    data_analyst = ['Data Architect', 'Analytics', 'Analyst']
    machine_learning = ['Machine', 'ML', 'Machine Learning']
    data_scientist = ['Data Scientist', 'Data Engineer', 'BI']
    manager = ['Manager', 'Director', 'VP', 'Head']

    for keyword in data_analyst:
        if keyword in job_title:
            return 'Data Analyst'
    for keyword in machine_learning:
        if keyword in job_title:
            return 'Machine Learning'
    for keyword in data_scientist:
        if keyword in job_title:
            return 'Data Scientist'
    for keyword in manager:
        if keyword in job_title:
            return 'Manager'
    return 'Other'

df['job_category'] = df['job_title'].apply(classify_job)


# 2. change the ISO alpha-2 country code to the country name
def change_country(alpha_2):
    country = pycountry.countries.get(alpha_2=alpha_2)
    if country is not None:
        return country.name
    return 'Unknown'

df['company_location'] = df['company_location'].apply(change_country)


# 3. clean the dataset (replace/fillna return copies, so assign the result back)
if df.isnull().any().any():
    df = df.replace(" ", np.nan)
    df = df.fillna(0)
else:
    print("the dataset is good")


# 4. replace the coded values with readable names
df['employment_type'] = df['employment_type'].replace({
    'FL': 'Freelancer',
    'CT': 'Contractor',
    'FT': 'Full-time',
    'PT': 'Part-time'
})
df['company_size'] = df['company_size'].replace({
    'S': 'SMALL',
    'M': 'MEDIUM',
    'L': 'LARGE',
})
df['experience_level'] = df['experience_level'].replace({
    'SE': 'Senior-level',
    'MI': 'Mid-level',
    'EN': 'Entry-level',
    'EX': 'Executive-level'
})
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   work_year           3755 non-null   int64
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
None
   work_year experience_level employment_type                 job_title  salary salary_currency  salary_in_usd employee_residence  remote_ratio company_location company_size
0       2023               SE              FT  Principal Data Scientist   80000             EUR          85847                 ES           100               ES            L
1       2023               MI              CT               ML Engineer   30000             USD          30000                 US           100               US            S
2       2023               MI              CT               ML Engineer   25500             USD          25500                 US           100               US            S
3       2023               SE              FT            Data Scientist  175000             USD         175000                 CA           100               CA            M
4       2023               SE              FT            Data Scientist  120000             USD         120000                 CA           100               CA            M


2. Exploratory data analysis and visualization

print(df.describe())

# the relationship between salary and the different kinds of jobs
job = df.groupby('job_category')
values_count = df['job_category'].value_counts(normalize=True) * 100

# the percentage of each job category
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=values_count.index, y=values_count, ax=ax)
plt.title("Job Category", fontsize=15, color='brown')
plt.xlabel("Job Category", fontsize=13)
plt.ylabel("Percentage", fontsize=13)

# the mean/min/max salary for each job category
list_job = ['Data Analyst', 'Data Scientist', 'Machine Learning', 'Manager', 'Other']
list_mean_salary = list(job['salary'].mean())
list_min_salary = list(job['salary'].min())
list_max_salary = list(job['salary'].max())

salary_stats = [list_mean_salary, list_min_salary, list_max_salary]
fig, ax = plt.subplots(figsize=(10, 7))
im = ax.imshow(salary_stats, cmap='YlGnBu')
ax.set_xticks(np.arange(len(list_job)))
ax.set_yticks(np.arange(3))
ax.set_xticklabels(list_job)
ax.set_yticklabels(['mean', 'min', 'max'])
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# annotate each cell of the heatmap with its value
for i in range(3):
    for j in range(len(list_job)):
        ax.text(j, i, salary_stats[i][j],
                ha="center", va="center", color="black")
ax.set_title("Salary statistics for each job category", fontsize=15, color='brown')
fig.tight_layout()


3. Diagnostic analysis and visualization

I. Salary by experience level and the share of each level
plt.figure(figsize=(15, 9))
level = df.groupby('experience_level')
level_mean_salary = level['salary'].mean()

ax1 = plt.subplot(1, 2, 1)
plt.title("mean salary share by experience level", fontsize=13, color='brown')
# labels follow the alphabetical order produced by groupby
labels = ['Entry-level', 'Executive-level', 'Mid-level', 'Senior-level']
plt.pie(level_mean_salary, labels=labels, autopct='%1.1f', shadow=True,
        colors=['C0', 'C1', 'C2', 'C3'])

ax2 = plt.subplot(1, 2, 2)
plt.title("mean salary by experience level", fontsize=13, color='brown')
plt.xlabel("experience level")
plt.ylabel("average salary", rotation="vertical")
plt.bar(labels, level_mean_salary, color=['C5', 'C7', 'C2', 'C1'])
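Note that the pie above is sliced by mean salary per level, so its wedges show salary share rather than head-count. If the goal is the percentage of workers at each level, `value_counts(normalize=True)` (already used earlier for the job categories) gives that directly. A sketch on hypothetical data:

```python
import pandas as pd

# hypothetical mini-sample of experience levels
levels = pd.Series(["Senior-level", "Senior-level", "Mid-level", "Entry-level"])

# fraction of workers at each level, scaled to percent
pct = levels.value_counts(normalize=True) * 100
print(pct["Senior-level"])  # 50.0
```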


II. Relationship between remote_ratio, employment_type, and salary
from pyecharts import options as opts
from pyecharts.charts import Bar3D

grouped = df.groupby('employment_type')
employment = grouped['salary'].mean().index.tolist()
average_salary = []
for name in employment:
    remote_group = grouped.get_group(name).loc[:, ['salary', 'remote_ratio']]
    average_salary.append(remote_group.groupby('remote_ratio')['salary'].mean().tolist())

# some employment types are missing a remote_ratio group; pad them with 0
for i in range(len(average_salary)):
    while len(average_salary[i]) != 3:
        average_salary[i].append(0)


bar3d = Bar3D()
xaxis = [0, 1, 2, 3]   # one slot per employment type
ratio = df.groupby('remote_ratio')['salary'].mean().index.tolist()
yaxis = [0, 1, 2]      # one slot per remote ratio (0, 50, 100)
data = [[xaxis[i], yaxis[j], average_salary[i][j]]
        for i in range(len(xaxis)) for j in range(len(yaxis))]


(
    bar3d.add(
        series_name="",
        data=data,
        xaxis3d_opts=opts.Axis3DOpts(type_="category", data=employment, name="employment_type",
                                     name_gap=30, is_show=True, grid_3d_index=0,
                                     axislabel_opts=opts.LabelOpts(rotate=-30)),
        yaxis3d_opts=opts.Axis3DOpts(type_="category", data=ratio, name="remote_ratio",
                                     name_gap=30, is_show=True, grid_3d_index=0,
                                     axislabel_opts=opts.LabelOpts(rotate=-30)),
        zaxis3d_opts=opts.Axis3DOpts(type_="value", name="salary", name_gap=30,
                                     is_show=True, grid_3d_index=0),
        grid3d_opts=opts.Grid3DOpts(view_control_alpha=60, view_control_beta=10),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="salary by remote_ratio and employment_type"),
        visualmap_opts=opts.VisualMapOpts(
            max_=800000,
            range_color=[
                "#313695", "#4575b4", "#74add1", "#abd9e9", "#e0f3f8", "#ffffbf",
                "#fee090", "#fdae61", "#f46d43", "#d73027", "#a50026",
            ],
        ),
    )
)

[3D bar chart: average salary by employment type and remote ratio]
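The manual group-then-pad loop above can also be done with a single `pivot_table` call, which builds the employment-type × remote-ratio salary matrix and fills missing combinations with 0 in one step. A sketch on synthetic rows (only the column names match the real dataset):

```python
import pandas as pd

# synthetic rows; the real columns have the same names
df = pd.DataFrame({
    "employment_type": ["Full-time", "Full-time", "Part-time"],
    "remote_ratio": [0, 100, 0],
    "salary": [100000, 120000, 40000],
})

# rows: employment types; columns: remote ratios; missing combos become 0
matrix = df.pivot_table(values="salary", index="employment_type",
                        columns="remote_ratio", aggfunc="mean", fill_value=0)
print(matrix.loc["Part-time", 100])  # 0 — filled, since no such row exists
```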

III. Number of companies and average salary in each country
from pyecharts import options as opts
from pyecharts.charts import Map

group_country = df.groupby('company_location')
countries_mean = group_country['salary'].mean()
country_money = []
for name in countries_mean.index.tolist():
    country_money.append([name, countries_mean[name]])

size = group_country['company_size'].value_counts()

size = size.unstack()     # one column per company size
size = size.fillna(0)
size = size.astype(int)   # counts should be integers
size = size.reset_index()
size = size.values.tolist()

# reshape each row into [country, [count per company size]]
for i in range(len(size)):
    size[i] = [size[i][0], size[i][1:]]


c = (
    Map()
    .add("salary", country_money, "world")
    .add("company_size", size, "world")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Map - salary and company size by location",
            pos_left='center', pos_top='top', padding=20),
        visualmap_opts=opts.VisualMapOpts(max_=300),
    )
)


IV. Average salary for the three company sizes
jobs_money = df.loc[:, ['job_category', 'salary', 'company_size']]
fig, ax = plt.subplots(figsize=(10, 7))  # fig, not plt: do not shadow the pyplot module
sns.barplot(x='job_category', y='salary', hue='company_size', data=jobs_money, ax=ax)
ax.set_title("Salary for different company sizes", fontsize=15, color='brown')
ax.set_xlabel("job category", fontsize=13)
ax.set_ylabel("salary", fontsize=13)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '${:,.0f}'.format(y)))

[grouped bar chart: salary by job category and company size]

4. Predictive analysis and visualization

Use a Random Forest to predict job_category from company_location, employee_residence, employment_type, and salary.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

string_features = ['company_location', 'employee_residence', 'employment_type']
string_data = df[string_features]

# one-hot encoding of the categorical features
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(string_data)
encoded_df = pd.DataFrame(encoded_data.toarray(),
                          columns=encoder.get_feature_names_out(string_features))
numeric_df = df['salary']
merged_df = pd.concat([encoded_df, numeric_df], axis=1)

# split the data
X_train, X_test, Y_train, Y_test = train_test_split(
    merged_df.values, df['job_category'], test_size=0.2, random_state=42)


classifier = RandomForestClassifier(n_estimators=100, random_state=42, criterion='entropy')
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_test)


result = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = accuracy_score(Y_test, y_pred)
print("Accuracy:", result1)
result2 = classification_report(Y_test, y_pred)
print("Classification Report:")
print(result2)
The result of the prediction:
Confusion Matrix:
[[ 89  97   5   0   7]
 [ 49 293  19   5  13]
 [ 14  46  17   3   3]
 [  3  11   3   6   2]
 [ 10  37   6   3  10]]
Accuracy: 0.5525965379494008

Classification Report:
                  precision    recall  f1-score   support

    Data Analyst       0.54      0.45      0.49       198
  Data Scientist       0.61      0.77      0.68       379
Machine Learning       0.34      0.20      0.26        83
         Manager       0.35      0.24      0.29        25
           Other       0.29      0.15      0.20        66

        accuracy                           0.55       751
       macro avg       0.42      0.36      0.38       751
    weighted avg       0.52      0.55      0.53       751
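As a sanity check, the reported accuracy can be recomputed from the confusion matrix alone: it is the trace (the correct predictions on the diagonal) divided by the total number of test samples.

```python
import numpy as np

# confusion matrix copied from the output above
cm = np.array([
    [89,  97,  5, 0,  7],
    [49, 293, 19, 5, 13],
    [14,  46, 17, 3,  3],
    [ 3,  11,  3, 6,  2],
    [10,  37,  6, 3, 10],
])

# 415 correct predictions out of 751 test samples
accuracy = np.trace(cm) / cm.sum()
print(accuracy)  # 0.5525965379494008
```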

Prediction visualization:
# predict the job category for one example
# (154 random one-hot feature values plus a salary of 94100 — for illustration only)
input_data = np.random.rand(1, 154)
predicted_data = np.append(input_data, 94100)
predicted_data = predicted_data.reshape(1, -1)
predicted_job_category = classifier.predict(predicted_data)
print("Predicted job_category:", predicted_job_category)


# draw the heat map of the confusion matrix, normalized by row (true class)
confusion_mat = result
normalized_mat = confusion_mat / confusion_mat.sum(axis=1)[:, np.newaxis]
print(normalized_mat)
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mat, annot=True, cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix Heatmap')