In this segment, we will work through a real-world example: predicting workers' wages from a linear combination of worker characteristics, and assessing the predictive performance of our prediction rules using the mean squared error (MSE), the adjusted MSE, R-squared and adjusted R-squared, as well as the out-of-sample MSE and R-squared.
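For reference, with n observations and p estimated coefficients (including the intercept), these are the standard definitions matching the degrees-of-freedom adjustments applied in the code below:

$$
\text{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(Y_i - \hat{Y}_i\right)^2,
\qquad
\text{MSE}_{\text{adj}} = \frac{n}{n-p}\,\text{MSE},
$$

$$
R^2 = 1 - \frac{\sum_{i=1}^{n}\left(Y_i - \hat{Y}_i\right)^2}{\sum_{i=1}^{n}\left(Y_i - \bar{Y}\right)^2},
\qquad
R^2_{\text{adj}} = 1 - \left(1 - R^2\right)\frac{n-1}{n-p}.
$$

The out-of-sample MSE and R-squared are the same unadjusted quantities evaluated on a held-out test set that the model was not fitted on.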
The data comes from the March supplement of the U.S. Current Population Survey for the year 2012. It focuses on single (never-married) workers with an education level of high school, some college, or college graduate. The sample size is approximately 4,000.
The outcome variable Y is an hourly wage, and the X’s are various characteristics of workers such as gender, experience, education, and geographical indicators.
The dataset contains the following variables:

- female: indicator equal to 1 for female workers
- cg, sc, hsg: education indicators for college graduate, some college, and high school graduate
- mw, so, we, ne: region indicators for Midwest, South, West, and Northeast
- exp1: years of work experience; exp2 and exp3 are exp1^2 / 100 and exp1^3 / 1000
- wage: hourly wage, the outcome variable
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Load data
df = pd.read_csv('predicting_wages.csv')
# See the top 5 rows of the dataset
df.head()
| | female | cg | sc | hsg | mw | so | we | ne | exp1 | exp2 | exp3 | wage |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 33.0 | 10.89 | 35.937 | 11.659091 |
1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 27.0 | 7.29 | 19.683 | 12.825000 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 13.0 | 1.69 | 2.197 | 5.777027 |
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2.0 | 0.04 | 0.008 | 12.468750 |
4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 15.0 | 2.25 | 3.375 | 18.525000 |
# Checking info of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3835 entries, 0 to 3834
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   female  3835 non-null   int64
 1   cg      3835 non-null   int64
 2   sc      3835 non-null   int64
 3   hsg     3835 non-null   int64
 4   mw      3835 non-null   int64
 5   so      3835 non-null   int64
 6   we      3835 non-null   int64
 7   ne      3835 non-null   int64
 8   exp1    3835 non-null   float64
 9   exp2    3835 non-null   float64
 10  exp3    3835 non-null   float64
 11  wage    3835 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 359.7 KB
# Printing the summary statistics for the dataset
df.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
female | 3835.0 | 0.417992 | 0.493293 | 0.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 |
cg | 3835.0 | 0.376271 | 0.484513 | 0.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 |
sc | 3835.0 | 0.323859 | 0.468008 | 0.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 |
hsg | 3835.0 | 0.299870 | 0.458260 | 0.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 |
mw | 3835.0 | 0.287614 | 0.452709 | 0.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 |
so | 3835.0 | 0.243546 | 0.429278 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 1.000000 |
we | 3835.0 | 0.211734 | 0.408591 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 1.000000 |
ne | 3835.0 | 0.257106 | 0.437095 | 0.000000 | 0.00000 | 0.000000 | 1.000000 | 1.000000 |
exp1 | 3835.0 | 13.353194 | 8.639348 | 2.000000 | 6.00000 | 11.000000 | 19.500000 | 35.000000 |
exp2 | 3835.0 | 2.529267 | 2.910554 | 0.040000 | 0.36000 | 1.210000 | 3.802500 | 12.250000 |
exp3 | 3835.0 | 5.812103 | 9.033207 | 0.008000 | 0.21600 | 1.331000 | 7.414875 | 42.875000 |
wage | 3835.0 | 15.533356 | 13.518165 | 0.004275 | 9.61875 | 13.028571 | 17.812500 | 348.333017 |
df[['exp1','exp2','exp3','wage']].boxplot(figsize=(20,10))
plt.show()
sns.scatterplot(x='exp1', y='wage', data=df) # wage vs. experience
<AxesSubplot:xlabel='exp1', ylabel='wage'>
cols=df.select_dtypes('int').columns.to_list()
cols
['female', 'cg', 'sc', 'hsg', 'mw', 'so', 'we', 'ne']
sns.kdeplot(df.wage,hue=df.female)
<AxesSubplot:xlabel='wage', ylabel='Density'>
for i in cols:
    # wage density split by each binary indicator
    sns.kdeplot(df.wage, hue=df[i])
    plt.show()
for i in cols:
    # wage vs. experience, colored by each binary indicator
    sns.scatterplot(x='exp1', y='wage', hue=i, data=df)
    plt.show()
Now that we have finished the exploratory analysis, let's move on to modeling.
#################### Linear and Quadratic specifications ##############################
# Wage linear regression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
Y = df['wage'] # target variable
X = df[['female' , 'sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']] #regressors
#defining the model
model = LinearRegression()
# fit the linear regression to the regressors and the target variable
results = model.fit(X,Y) # train the model
print("Intercept",results.intercept_) # beta_0
Intercept 4.915401521241382
#coefficients of other regressors
pd.DataFrame(results.coef_.reshape(1,-1),columns=X.columns)
| | female | sc | cg | mw | so | we | exp1 | exp2 | exp3 |
---|---|---|---|---|---|---|---|---|---|
0 | -1.826397 | 2.486523 | 9.870806 | -1.214237 | 0.404601 | -0.250799 | 1.096456 | -4.013388 | 0.460337 |
# compute MSE and R^2
y_pred = results.predict(X) # predictions on same data
print("R-squared",metrics.r2_score(Y,y_pred)) # r-squared
MSE_adj2 = (3835/(3835-10)) * np.mean(np.square(Y-y_pred)) # n = 3835, p = 10 (9 regressors + intercept)
print("MSE adj:",MSE_adj2) # adjusted MSE
R-squared 0.0954880031243256
MSE adj: 165.68015524312761
adj_rsquared = 1 - ((1-0.095488)*(3835-1)/(3835-9-1))
print("Adj Rsquared:",adj_rsquared)# adjusted r-squared
Adj Rsquared: 0.09335973647058826
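To avoid hard-coding n, p, and the R-squared value each time, the in-sample metrics above can be collected in a small helper. This is only a convenience sketch; the name in_sample_metrics is my own and is not used elsewhere in this notebook.

# Sketch of a helper for the in-sample metrics computed above (name is my own)
def in_sample_metrics(y_true, y_hat, p):
    # p counts all estimated coefficients, including the intercept
    n = len(y_true)
    r2 = metrics.r2_score(y_true, y_hat)
    mse_adj = (n / (n - p)) * np.mean(np.square(y_true - y_hat))  # df-corrected MSE
    r2_adj = 1 - (1 - r2) * (n - 1) / (n - p)                     # adjusted R-squared
    return r2, r2_adj, mse_adj

# e.g. for the basic specification (9 regressors plus the intercept):
# in_sample_metrics(Y, y_pred, p=10)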
In the flexible (quadratic) specification, X consists of D, the female indicator, together with W, which contains all of the controls from the basic model plus their two-way interactions.
A two-way interaction is simply the product of two regressors; including it lets the model capture the combined effect of the two interacting variables.
After creating the interaction features, the design matrix has 37 columns, but four of the interaction columns (sc·cg, mw·so, mw·we, so·we) are identically zero because the education dummies are mutually exclusive and so are the region dummies, so the flexible specification effectively includes p = 33 regressors.
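As a quick sanity check, the products of these mutually exclusive dummies can be inspected directly (a short sketch using the df loaded above):

# Interactions between mutually exclusive dummies are identically zero,
# which is why the 37 columns amount to only 33 effective regressors.
for a, b in [('sc', 'cg'), ('mw', 'so'), ('mw', 'we'), ('so', 'we')]:
    print(a, '*', b, '->', (df[a] * df[b]).sum())  # expected: 0.0 for every pair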
# Linear regression: Quadratic specification
from sklearn.preprocessing import PolynomialFeatures
X = X.drop('female', axis=1)  # build interactions among the controls only
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)  # original columns plus all two-way interactions
# on scikit-learn < 1.0, use poly.get_feature_names(X.columns) instead
X_poly = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))
X_poly['female'] = df['female']  # add the female indicator back
X['female'] = df['female']
# train the model
results = model.fit(X_poly,Y)
# beta_0
print("Intercept",results.intercept_)
Intercept 16.552407194295007
pd.DataFrame(results.coef_.reshape(1,-1),columns=X_poly.columns).T # printing the coefficients in a tabular format
| | 0 |
---|---|
sc | -2.386526e+00 |
cg | 2.240488e+00 |
mw | -5.519395e+00 |
so | -2.914391e+00 |
we | -8.053925e-01 |
exp1 | -1.321456e+00 |
exp2 | 1.252185e+01 |
exp3 | -4.838676e-02 |
sc cg | 6.084022e-14 |
sc mw | -7.225602e-01 |
sc so | -6.512599e-01 |
sc we | -1.046514e-01 |
sc exp1 | 8.390751e-01 |
sc exp2 | -4.060847e+00 |
sc exp3 | 6.330261e-01 |
cg mw | -7.609079e-01 |
cg so | 1.704052e+00 |
cg we | -1.494761e+00 |
cg exp1 | 7.858795e-01 |
cg exp2 | -4.897043e-02 |
cg exp3 | -5.950344e-01 |
mw so | 2.264855e-14 |
mw we | 4.857226e-14 |
mw exp1 | 1.107607e+00 |
mw exp2 | -6.052683e+00 |
mw exp3 | 9.062626e-01 |
so we | 2.664535e-15 |
so exp1 | 3.947042e-01 |
so exp2 | -8.913739e-01 |
so exp3 | -3.515656e-02 |
we exp1 | 4.719171e-01 |
we exp2 | -3.918829e+00 |
we exp3 | 8.050007e-01 |
exp1 exp2 | -4.838676e-01 |
exp1 exp3 | 8.720600e-02 |
exp2 exp3 | -5.798236e-02 |
female | -1.880013e+00 |
# compute MSE and R^2
y_pred = results.predict(X_poly)
print("R-squared",metrics.r2_score(Y,y_pred))
MSE_adj2 = (3835/(3835-33)) * np.mean(np.square(Y-y_pred)) # n = 3835, p = 33 effective regressors
print("MSE adj:",MSE_adj2) # adjusted MSE
R-squared 0.10397282164398391
MSE adj: 165.1188560892233
adj_rsquared = 1 - ((1-0.10397)*(3835-1)/(3835-33-1))
adj_rsquared
0.09619073401736389
Given that p/n is quite small here (33/3835 ≈ 0.009), the sample linear regression should approximate the population linear regression quite well.
| | p | R-squared_sample | R-squared_adj | MSE_adj |
---|---|---|---|---|
basic reg | 10 | 0.0954 | 0.093 | 165.680 |
flexi reg | 33 | 0.1039 | 0.096 | 165.118 |
We conclude that the in-sample performance of the basic and the flexible model is about the same, with the flexible model being just slightly better (slightly higher R², slightly lower adjusted MSE).
#################### Linear and Quadratic specifications with Sample Splitting ##############################
from sklearn.model_selection import train_test_split # utility for splitting data into train and test sets
# Split manually into halves (first half train, second half test), without using train_test_split
X_train = X.iloc[:1918,:]
X_test = X.iloc[1918:,:]
Y_train = Y.iloc[:1918]
Y_test = Y.iloc[1918:]
#Split using the function train_test_split
# X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.5,random_state=101) # splitting data into train and test
results = model.fit(X_train,Y_train)
print("Intercept",results.intercept_)
Intercept 4.184615914173756
pd.DataFrame(results.coef_.reshape(1,-1),columns=X.columns)
| | sc | cg | mw | so | we | exp1 | exp2 | exp3 | female |
---|---|---|---|---|---|---|---|---|---|
0 | 2.38791 | 10.007581 | -0.939453 | 0.0 | -8.881784e-16 | 1.241326 | -4.767478 | 0.566172 | -1.654739 |
y_pred = results.predict(X_test) # predict y values on the test data
print("R-squared",metrics.r2_score(Y_test,y_pred)) # r-squared
print("MSE",metrics.mean_squared_error(Y_test,y_pred))# MSE
R-squared 0.10272836637029858
MSE 154.58369290508847
# Linear regression: Quadratic specification
X_train = X_poly.iloc[:1918,:]
X_test = X_poly.iloc[1918:,:]
Y_train = Y.iloc[:1918]
Y_test = Y.iloc[1918:]
# X_train,X_test,Y_train,Y_test = train_test_split(X_poly,Y,test_size=0.5,random_state=101)
results = model.fit(X_train,Y_train)
print("Intercept",results.intercept_)
Intercept 12.36412206922852
pd.DataFrame(results.coef_.reshape(1,-1),columns=X_poly.columns)
| | sc | cg | mw | so | we | exp1 | exp2 | exp3 | sc cg | sc mw | ... | so exp1 | so exp2 | so exp3 | we exp1 | we exp2 | we exp3 | exp1 exp2 | exp1 exp3 | exp2 exp3 | female |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.333983 | 6.482857 | -6.643465 | 1.151801e-12 | -6.696865e-13 | -0.913686 | 16.083399 | -0.104036 | 6.877832e-13 | -0.99051 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.040364 | 0.314212 | -0.351953 | -1.751726 |
1 rows × 37 columns
y_pred = results.predict(X_test)
print("R-squared",metrics.r2_score(Y_test,y_pred))
print("MSE",metrics.mean_squared_error(Y_test,y_pred))
R-squared 0.10460461287646161
MSE 154.26044952721676
| | p | R-squared_test | MSE_test |
---|---|---|---|
basic reg | 10 | 0.1027 | 154.584 |
flexi reg | 33 | 0.1046 | 154.260 |
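As with the in-sample comparison, the two specifications perform about the same out of sample, with the flexible model again marginally better (slightly higher test R-squared, slightly lower test MSE).

For convenience, the fit-on-train / evaluate-on-test steps above can be collected in a small helper. This is only a sketch, and the name out_of_sample_metrics is my own rather than anything used in this notebook.

# Sketch: out-of-sample evaluation wrapped in a helper (name is my own)
def out_of_sample_metrics(model, X_tr, Y_tr, X_te, Y_te):
    fitted = model.fit(X_tr, Y_tr)   # train on the training half
    pred = fitted.predict(X_te)      # predict on the held-out half
    return metrics.r2_score(Y_te, pred), metrics.mean_squared_error(Y_te, pred)

# e.g. out_of_sample_metrics(LinearRegression(), X_train, Y_train, X_test, Y_test)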