import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Reading the Data
pd.set_option('display.max_columns', 40)
data=pd.read_csv(r"C:\Users\Shashank Sundi\Desktop\I neuron\assignments\SCM SHIPMENT PRICING\SCMS_Delivery_History_Dataset.csv")
data.head()
# Pack Price = Unit of Measure (Per Pack) * Unit Price
# Line Item Value = Line Item Quantity * Pack Price
# Target column -----> Shipment Price (total cost of transporting the shipment to its destination) = Freight Cost + Line Item Value + Line Item Insurance
# So we'll create a new feature for that later on
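# A quick sanity-check sketch (not part of the original flow): on rows where all terms are present, it measures how often
# the two relationships above hold within 1%. Column names are the raw CSV columns used later in this notebook.
chk=data[['Unit of Measure (Per Pack)','Unit Price','Pack Price','Line Item Quantity','Line Item Value']].dropna()
print("Pack Price ~= Unit of Measure (Per Pack) * Unit Price :",
      np.isclose(chk['Unit of Measure (Per Pack)']*chk['Unit Price'],chk['Pack Price'],rtol=0.01).mean())
print("Line Item Value ~= Line Item Quantity * Pack Price :",
      np.isclose(chk['Line Item Quantity']*chk['Pack Price'],chk['Line Item Value'],rtol=0.01).mean())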
# Checking and Validating Data Types
pd.DataFrame(data=data.dtypes.values,index=data.dtypes.index).T
# We can see that the columns Scheduled Delivery Date, Delivered to Client Date, Delivery Recorded Date, PQ First Sent to Client Date and PO Sent to Vendor Date are dates, but are identified as 'object' type
# We can also see that the columns Weight (Kilograms) and Freight Cost (USD) are actually numeric values, but are classified as 'object' type
# We need to change the data types of the columns identified above
for i in ['Weight (Kilograms)', 'Freight Cost (USD)','PQ First Sent to Client Date','PO Sent to Vendor Date']:
    print(data[i].value_counts().head(),"\n\n")
# In the cell above we observe that some shipments were weighed separately, so the weight itself is unknown; we replace those entries with NaN.
# Similarly, in the Freight Cost column the cost was either not read from the ASN/DN, invoiced separately, etc., so the value tells us to read the tag manually,
# which is not feasible for every product; we replace those entries with NaN as well.
# To keep the information carried by these special values ("Weight Captured Separately", "Freight Included in Commodity Cost", etc.),
# we'll create new features, otherwise we might lose important information.
# Creating new features to capture the special cases where measurements could either not be read or were recorded separately
# For Freight Cost
freight=data['Freight Cost (USD)'].astype(str)
data['Freight_cost_special']=np.select(
    [freight.isin(["Freight Included in Commodity Cost",'Invoiced Separately']),
     freight.str.startswith("See")],
    [freight,"See ASN/DN Tag"],
    default="Normal Measurement")
# For Shipment weight
weight=data['Weight (Kilograms)'].astype(str)
data['Weight_special']=np.select(
    [weight=="Weight Captured Separately",
     weight.str.startswith("See")],
    [weight,"See ASN/DN Tag"],
    default="Normal Measurement")
# For PQ First Sent to Client Date
pq=data['PQ First Sent to Client Date'].astype(str)
data['PQ_date_sent']=np.where(pq.isin(['Pre-PQ Process','Date Not Captured']),pq,"Date Captured")
# For PO Sent to Vendor Date
po=data['PO Sent to Vendor Date'].astype(str)
data['PO_date_sent']=np.where(po.isin(['N/A - From RDC','Date Not Captured']),po,"Date Captured")
# Converting data types to Datetime
# For PQ First Sent to Client Date & PO Sent to Vendor Date, roughly 30-50% of the values are missing and the dates are not sequential, so we can't impute them
# in any meaningful way. Having captured the importance of the missing dates in the previous cell, we will drop these two columns; if we had all the dates, we could have derived another feature.
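# A small sketch to back the ~30-50% figure: the share of entries in these two columns that cannot be parsed as a date
# (e.g. 'Pre-PQ Process', 'Date Not Captured', 'N/A - From RDC')
for col in ['PQ First Sent to Client Date','PO Sent to Vendor Date']:
    parsed=pd.to_datetime(data[col],errors='coerce')
    print(col,":",round(parsed.isnull().mean()*100,1),"% not parseable as a date")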
for i in ['Scheduled Delivery Date', 'Delivered to Client Date','Delivery Recorded Date']:
    data[i]=pd.to_datetime(data[i])
# Converting data types to Numeric
for i in ['Weight (Kilograms)', 'Freight Cost (USD)']:
    data[i]=pd.to_numeric(data[i],errors='coerce')
# Rearranging the data
# and excluding the columns - 'ID', 'Project Code', 'PQ', 'PO/SO', 'ASN/DN', 'PQ First Sent to Client Date', 'PO Sent to Vendor Date'
data=data[['Country','Vendor','Manufacturing Site', 'Brand', 'Item Description','Product Group', 'Sub Classification',
'Molecule/Test Type','Dosage Form','Dosage','Managed By', 'Vendor INCO Term','Fulfill Via', 'Shipment Mode',
'Scheduled Delivery Date', 'Delivered to Client Date','Delivery Recorded Date', 'Unit of Measure (Per Pack)',
'Line Item Quantity','Line Item Value', 'Pack Price', 'Unit Price','First Line Designation', 'Weight (Kilograms)',
'Weight_special', 'Freight Cost (USD)', 'Freight_cost_special','Line Item Insurance (USD)']]
data.head()
# Now , all dtypes are correct
pd.DataFrame(data=data.dtypes.values,index=data.dtypes.index).T
# getting index of object type column names
categ_index=data.dtypes[data.dtypes=='object'].index
# getting index of numeric type columns
floats=data.dtypes[data.dtypes=='float64'].index
ints=data.dtypes[data.dtypes=='int64'].index
num_index=floats.append(ints)
# getting index of datetime columns
date_index=data.dtypes[data.dtypes=='datetime64[ns]'].index
# Statistical properties of numerical columns
data[num_index].describe()
# Statistical properties of categorical columns
data[categ_index].describe()
# Checking Distribution of data in columns
fig,ax=plt.subplots(4,2,figsize=(30,15))
i,j=0,0
for col in num_index:
    sns.kdeplot(data[col],ax=ax[i,j],fill=True,linewidth=5)
    j+=1
    if j==2:
        j=0
        i+=1
# Checking Normality of distribution using Normal Probability Plot
from scipy import stats
fig,ax=plt.subplots(4,2,figsize=(30,15))
plt.subplots_adjust(wspace=0.7, hspace=1.1)
a,b=0,0
for col in num_index:
    stats.probplot(data[col],plot=ax[a,b])
    ax[a,b].set_title(col)
    b+=1
    if b==2:
        b=0
        a+=1
# It is observed that most of the features are Right skewed (might have outliers in the right tail)
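# A quick sketch to back the visual impression with sample skewness (positive values indicate a right tail); num_index is the list of numeric columns defined above
print(data[num_index].skew().sort_values(ascending=False))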
# Visualizing Correlation b/w features ,using Scatter Plots
sns.pairplot(data[num_index])
fig,ax=plt.subplots(1,3,figsize=(30,8))
# Top 10 Manufacturing sites by Total Price/Value of Packs produced
data.groupby(['Manufacturing Site'])['Line Item Value'].sum().nlargest(10).plot(kind='bar',ax=ax[0])
ax[0].set_ylabel('Shipment Value')
# Top 10 Countries by Total Price/Value of Packs produced
data.groupby(['Country'])['Line Item Value'].sum().nlargest(10).plot(kind='bar',ax=ax[1])
ax[1].set_ylabel('Shipment Value')
# Top 10 Vendors by Total Price/Value of Packs produced
data.groupby(['Vendor'])['Line Item Value'].sum().nlargest(10).plot(kind='bar',ax=ax[2])
ax[2].set_ylabel('Shipment Value')
fig,ax=plt.subplots(1,3,figsize=(25,5))
data.groupby(['Shipment Mode'])['Line Item Value'].sum().nlargest(5).plot(kind='bar',ax=ax[0])
ax[0].set_ylabel('Shipment Value')
data.groupby(['Dosage Form'])['Line Item Value'].sum().nlargest(5).plot(kind='bar',ax=ax[1])
ax[1].set_ylabel('Shipment Value')
data.groupby(['Item Description'])['Line Item Value'].sum().nlargest(5).plot(kind='bar',ax=ax[2])
plt.xticks(rotation=15)
ax[2].set_ylabel('Shipment Value')
fig,ax=plt.subplots(1,3,figsize=(25,5))
plt.subplots_adjust( wspace=0.5)
sns.barplot(x='Shipment Mode',y='Freight Cost (USD)',data=data,ax=ax[0])
sns.barplot(x='Product Group',y='Freight Cost (USD)',data=data,ax=ax[1])
sns.barplot(x='Weight_special',y='Freight Cost (USD)',data=data,ax=ax[2])
plt.xticks(rotation=15)
# Checking for Null values
pd.DataFrame(data=data.isnull().sum().values,index=data.isnull().sum().index).T
# Getting Columns with null values
null_val_cols=(data.isnull().sum()[data.isnull().sum()>0]).index
null_val_cols
# Random Sample Imputation for Categorical Columns
for col in ['Dosage', 'Shipment Mode']:
    rand_samples=data[col].dropna().sample(data[col].isnull().sum())
    rand_samples.index=data[data[col].isnull()].index
    data.loc[data[col].isnull(),col]=rand_samples
# KNN Imputation for numerical columns
from sklearn.impute import KNNImputer
# impute the three columns together so the imputer can actually use neighbouring feature values
num_null_cols=['Weight (Kilograms)', 'Freight Cost (USD)','Line Item Insurance (USD)']
imputer=KNNImputer(n_neighbors=10)
data[num_null_cols]=imputer.fit_transform(data[num_null_cols])
pd.DataFrame(data=data.isnull().sum().values,index=data.isnull().sum().index).T
data.shape
# Encoding Categorical Columns
# frequency encode - 'Country', 'Vendor', 'Manufacturing Site', 'Brand', 'Item Description','Product Group', 'Sub Classification', 'Molecule/Test Type','Dosage Form',
# 'Dosage', 'Managed By', 'Vendor INCO Term', 'Shipment Mode', 'Weight_special', 'Freight_cost_special'
# one hot encoding--'Fulfill Via','First Line Designation'
# ordinal encoding --- could be applied to the INCO terms and Shipment Mode, but that needs domain knowledge, and we also don't know the distributor's own preferences, which might favour one category over another
"""for cols in categ_index:
print(f"\n {data[cols].value_counts().head(10)}")"""
# Observed that all the categorical columns contain a large number of unique values
# We don't want to include every unique value; instead, we keep the top 10 categories per column and group the rest as "Other"
for col in categ_index:
    top_10=data[col].value_counts().head(10).index
    data[col]=data[col].where(data[col].isin(top_10),"Other")
for cols in categ_index:
    print(f"\n {data[cols].value_counts().head(10)}")
# While frequency encoding, we manually assign a low frequency to the "Other" category: it aggregates a large number of individually insignificant categories,
# so its raw count would be large. We do this so that the insignificant categories don't get an unnecessarily high weight during training.
# Columns with a manual tweak for "Other" - Country, Vendor, Manufacturing Site, Brand, Item Description, Molecule/Test Type, Dosage
# Plain frequency encoding - 'Product Group', 'Sub Classification', 'Dosage Form', 'Managed By', 'Vendor INCO Term', 'Shipment Mode', 'Weight_special', 'Freight_cost_special'
country_map=data.Country.value_counts().to_dict()
country_map["Other"]=min(country_map.values())-75
data.Country=data.Country.map((country_map))
vendor_map=data.Vendor.value_counts().to_dict()
vendor_map["Other"]=2
data.Vendor=data.Vendor.map(vendor_map)
manuf_site_map=data['Manufacturing Site'].value_counts().to_dict()
manuf_site_map["Other"]=2
data['Manufacturing Site']=data['Manufacturing Site'].map(manuf_site_map)
brand_map=data.Brand.value_counts().to_dict()
brand_map["Other"]=5
data.Brand=data.Brand.map(brand_map)
item_map=data['Item Description'].value_counts().to_dict()
item_map["Other"]=1
data['Item Description']=data['Item Description'].map(item_map)
test_type_map=data['Molecule/Test Type'].value_counts().to_dict()
test_type_map["Other"]=5
data['Molecule/Test Type']=data['Molecule/Test Type'].map(test_type_map)
dosage_map=data.Dosage.value_counts().to_dict()
dosage_map["Other"]=2
data.Dosage=data.Dosage.map(dosage_map)
prod_group_map=data['Product Group'].value_counts().to_dict()
data['Product Group']=data['Product Group'].map(prod_group_map)
sub_class_map=data['Sub Classification'].value_counts().to_dict()
data['Sub Classification']=data['Sub Classification'].map(sub_class_map)
dosage_form_map=data['Dosage Form'].value_counts().to_dict()
data['Dosage Form']=data['Dosage Form'].map(dosage_form_map)
managed_by_map=data['Managed By'].value_counts().to_dict()
data['Managed By']=data['Managed By'].map(managed_by_map)
inco_term_map=data['Vendor INCO Term'].value_counts().to_dict()
data['Vendor INCO Term']=data['Vendor INCO Term'].map(inco_term_map)
shipment_mode_map=data['Shipment Mode'].value_counts().to_dict()
data['Shipment Mode']=data['Shipment Mode'].map(shipment_mode_map)
wt_spcl_map=data['Weight_special'].value_counts().to_dict()
data['Weight_special']=data['Weight_special'].map(wt_spcl_map)
freight_spcl_map=data['Freight_cost_special'].value_counts().to_dict()
data['Freight_cost_special']=data['Freight_cost_special'].map(freight_spcl_map)
# one hot encoding--'Fulfill Via','First Line Designation'
data['Fulfill Via']=pd.get_dummies(data['Fulfill Via'],drop_first=True,dtype='float64') #---From RDC-1 ; Direct Drop-0
data['First Line Designation']=pd.get_dummies(data['First Line Designation'],drop_first=True,dtype='float64') #---Yes-1 ; No-0
data
# Outlier detection , feature selection/creation , model testing
fig,ax=plt.subplots(4,2,figsize=(25,10),squeeze=False)
plt.subplots_adjust(wspace=0.7, hspace=1.1)
i,j=0,0
for col in num_index:
    sns.boxplot(x=data[col],ax=ax[i,j],whis=3,linewidth=3)
    j+=1
    if j==2:
        j=0
        i+=1
# We should think carefully about what an outlier is, why it is an outlier, and what to do with such values if we choose to keep them in the data.
# Note that "outliers" are rare. If they are not rare, either they are not outliers or the method/data collection is seriously flawed.
# It is important to figure out whether a data value is a "true outlier" or an "influence or leverage point":
# is it due to some measurement error, or is it due to excessive variability in the data?
# 1e6 - 10^6
# Manually set thresholds above which a value is treated as an outlier, based on how sparse and rare the data becomes (see the quantile sketch after this list)
# Line Item Value - 2800000
# Pack Price - 200
# Unit Price - 30
# Weight - 100000
# Freight Cost - 120000
# Line Item Insurance - 3700
# Line Item Quantity - 270000
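# A rough sketch of how such cut-offs might be eyeballed (not part of the original flow; the thresholds above were chosen manually):
# look at the upper quantiles of each numeric column and pick a point beyond which observations become very sparse
print(data[num_index].quantile([0.95,0.99,0.999,1.0]).T)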
num_index
# Unit of Measure (Per Pack) -- although we see apparent outliers here, they may not be true outliers: packs can be filled with pills, test kits or a combination of both,
# and these occupy space very differently, which leads to excessive variability in the data. So we don't touch outliers in this column.
# Freight Cost -- this can vary widely with the distance and route of the shipment for a particular project; we can't call these values outliers, as they are
# not rare and contain important information. Hence we don't impute all of them; we only impute the points beyond which the data becomes very sparse, so that the model doesn't overfit.
# Similarly, the other numeric columns definitely carry some important information, so we apply the same logic there: impute only the very sparse extreme points.
# Outlier Imputation: values above the manual thresholds are replaced with the column mean (computed before replacement)
outlier_thresholds={'Line Item Value':2800000, 'Pack Price':200, 'Unit Price':30,
                    'Weight (Kilograms)':100000, 'Freight Cost (USD)':120000,
                    'Line Item Insurance (USD)':3700, 'Line Item Quantity':270000}
for col,threshold in outlier_thresholds.items():
    outlier_mask=data[col]>threshold
    col_mean=data[col].mean()
    data.loc[outlier_mask,col]=col_mean
    print(f"\n'{col}'")
    print(f"No. of outliers :{outlier_mask.sum()}")
fig,ax=plt.subplots(4,2,figsize=(25,10),squeeze=False)
plt.subplots_adjust(wspace=0.7, hspace=1.1)
i,j=0,0
for col in num_index:
    sns.boxplot(x=data[col],ax=ax[i,j],whis=3,linewidth=3)
    j+=1
    if j==2:
        j=0
        i+=1
# Feature Creation & Target Column Creation
# New Features - important KPIs in logistics
# --------- Delay in Delivery
data["dlvry_delay"]=data['Delivered to Client Date']-data['Scheduled Delivery Date']
# +ve days denotes Late delivery
# --------- Delay in Delivery_verification_time
data["dlvry_verif_time_delay"]=data['Delivery Recorded Date']-data['Delivered to Client Date']
# time taken to place the order after the Price Quote was received by the Client -- not computable here, since those two date columns were dropped earlier
#data['PQ First Sent to Client Date']-data['PO Sent to Vendor Date']
# value of goods = ( Line Item Value + Line Item insurance )/Line Item Quantity -------- assigns higher value to sales driving shipments,rather than volume driving shipments
data["item_value"]=(data['Line Item Value']+data['Line Item Insurance (USD)'])/data['Line Item Quantity']
# Target column
data["Shipment_Price"]=data['Freight Cost (USD)']+data['Line Item Value']+data['Line Item Insurance (USD)']
# Converting the timedelta features to days
data["dlvry_delay"]=data["dlvry_delay"]/pd.Timedelta(days=1)
data["dlvry_verif_time_delay"]=data["dlvry_verif_time_delay"]/pd.Timedelta(days=1)
data.columns
# Rearranging data and deleting date columns
data=data[['Country', 'Vendor', 'Manufacturing Site', 'Brand', 'Item Description',
'Product Group', 'Sub Classification', 'Molecule/Test Type',
'Dosage Form', 'Dosage', 'Managed By', 'Vendor INCO Term',
'Fulfill Via', 'Shipment Mode', 'dlvry_delay','dlvry_verif_time_delay',
'Unit of Measure (Per Pack)', 'Line Item Quantity', 'Line Item Value',
'Pack Price', 'Unit Price', 'First Line Designation',
'Weight (Kilograms)', 'Weight_special', 'Freight Cost (USD)',
'Freight_cost_special', 'Line Item Insurance (USD)', 'item_value', 'Shipment_Price']]
data.head()
# Feature Removal
plt.figure(figsize=(10,10))
sns.heatmap(data.drop(['Shipment_Price'],axis=1).corr())
# Finding Highly Correlated Columns
def correlation(data,threshold):
    col_corr=set()
    cor=data.corr()
    for i in range(len(cor.columns)):
        for j in range(len(cor.columns)):
            if (abs(cor.iloc[i,j]) > threshold) and i!=j:
                if (cor.columns[j] in col_corr) or (cor.columns[i] in col_corr):
                    continue
                print("\n",cor.columns[i],"-----",cor.columns[j])
                print(abs(cor.iloc[i,j]))
                colname=cor.columns[i]
                col_corr.add(colname)
    return col_corr
a=correlation(data.drop(['Shipment_Price'],axis=1),0.9)
print('\n',a)
# Removing the highly correlated columns - {'Vendor INCO Term', 'Vendor', 'Line Item Value', 'Weight_special'}
data=data.drop(['Vendor INCO Term', 'Vendor', 'Line Item Value', 'Weight_special'],axis=1)
data.head()
# Clustering the data to fit separate models on the different clusters, to check whether the models generalise better per cluster or on the data as a whole
# Finding Optimal No. of Clusters
from sklearn.cluster import KMeans
inertia=[]
for i in range(1,10):
    kmeans=KMeans(n_clusters=i)
    kmeans.fit(data)
    inertia.append(kmeans.inertia_)
elbow=pd.DataFrame({'Cluster':range(1,10),'Inertia':inertia})
plt.plot(elbow.Cluster,elbow.Inertia)
# From the elbow curve above , it seems reasonable to choose 3 clusters
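# Optional cross-check (sketch, not part of the original analysis): silhouette scores for a few candidate cluster counts,
# computed on a subsample to keep it cheap; higher is better. This only sanity-checks the elbow-based choice of 3.
from sklearn.metrics import silhouette_score
for k in range(2,6):
    labels=KMeans(n_clusters=k,random_state=0).fit_predict(data)
    print(k,round(silhouette_score(data,labels,sample_size=3000,random_state=0),3))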
kmeans=KMeans(n_clusters=3)
kmeans.fit(data)
data['Cluster']=kmeans.predict(data)
data.head()
data.Cluster.value_counts()
# Separating the 3 data clusters
data_c1=data[data.Cluster==0].drop(['Cluster'],axis=1).copy(deep=True)
data_c2=data[data.Cluster==1].drop(['Cluster'],axis=1).copy(deep=True)
data_c3=data[data.Cluster==2].drop(['Cluster'],axis=1).copy(deep=True)
# Function to calculate the adjusted r2 score (the data passed in still contains the target column, so the number of predictors is shape[1]-1)
def adjusted_r2(r2,data):
    n,p=data.shape[0],data.shape[1]-1
    return 1-(1-r2)*(n-1)/(n-p-1)
# Regression models to try
# Linear regression , lasso / regularised ,SVM regressor, KNN , rand forest regressor , XGB regressor
#Trying Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
scores_linreg=[]
for i in [data,data_c1,data_c2,data_c3]:
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)
    linreg=LinearRegression()
    linreg.fit(X_train,y_train)
    r2=linreg.score(X_test,y_test)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_linreg.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_linreg[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_linreg[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_linreg[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_linreg[3]}")
#Trying Out Elastic Net Regression
from sklearn.linear_model import ElasticNetCV ,ElasticNet
scores_elasticreg=[]
for i in [data,data_c1,data_c2,data_c3]:
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    elasticCV=ElasticNetCV(alphas=None,cv=10)
    elasticCV.fit(X_train,y_train)
    elastic_net_reg=ElasticNet(alpha=elasticCV.alpha_,l1_ratio=0.5)
    elastic_net_reg.fit(X_train,y_train)
    r2=elastic_net_reg.score(X_test,y_test)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_elasticreg.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_elasticreg[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_elasticreg[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_elasticreg[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_elasticreg[3]}")
#Trying Out Ridge Regression
from sklearn.linear_model import RidgeCV ,Ridge
scores_ridgereg=[]
for i in [data,data_c1,data_c2,data_c3]:
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    alphas=np.random.uniform(low=0,high=10,size=(50,))
    ridgeCV=RidgeCV(alphas=alphas,cv=10)
    ridgeCV.fit(X_train,y_train)
    ridge_reg=Ridge(alpha=ridgeCV.alpha_)
    ridge_reg.fit(X_train,y_train)
    r2=ridge_reg.score(X_test,y_test)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_ridgereg.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_ridgereg[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_ridgereg[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_ridgereg[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_ridgereg[3]}")
#Trying Out Support Vector Regression
from sklearn.svm import LinearSVR,SVR
from sklearn.preprocessing import StandardScaler
scaler_svm=StandardScaler()
scores_SVM_reg=[]
for i in [data,data_c1,data_c2,data_c3]:
    #scaler_svm.fit(i.drop(['Shipment_Price'],axis=1))
    #X=scaler_svm.transform(i.drop(['Shipment_Price'],axis=1))
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    svm_reg=SVR(kernel="rbf")
    svm_reg.fit(X_train,y_train)
    r2=svm_reg.score(X_test,y_test)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_SVM_reg.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_SVM_reg[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_SVM_reg[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_SVM_reg[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_SVM_reg[3]}")
#Trying Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor , plot_tree
from sklearn.metrics import r2_score
scores_dtree_reg=[]
for i in [data,data_c1,data_c2,data_c3]:
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    dtree_reg=DecisionTreeRegressor()
    dtree_reg.fit(X_train,y_train)
    pred=dtree_reg.predict(X_test)
    r2=r2_score(y_test,pred)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_dtree_reg.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_dtree_reg[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_dtree_reg[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_dtree_reg[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_dtree_reg[3]}")
#Trying Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
scores_rand_forest=[]
for i in [data,data_c1,data_c2,data_c3]:
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    rand_forest=RandomForestRegressor(min_samples_leaf=10,n_estimators=30)
    rand_forest.fit(X_train,y_train)
    pred=rand_forest.predict(X_test)
    r2=r2_score(y_test,pred)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_rand_forest.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_rand_forest[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_rand_forest[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_rand_forest[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_rand_forest[3]}")
#Trying XGBoost Regressor
from xgboost import XGBRegressor
scores_xgb=[]
for i in [data,data_c1,data_c2,data_c3]:
    X=i.drop(['Shipment_Price'],axis=1)
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    xgb_reg=XGBRegressor(subsample=0.7,max_depth=9,colsample_bytree=0.8,eta=0.1)
    xgb_reg.fit(X_train,y_train)
    pred=xgb_reg.predict(X_test)
    r2=r2_score(y_test,pred)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_xgb.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_xgb[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_xgb[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_xgb[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_xgb[3]}")
#Trying KNN Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
scaler_knn=StandardScaler()
scores_KNN_reg=[]
for i in [data,data_c1,data_c2,data_c3]:
    scaler_knn.fit(i.drop(['Shipment_Price'],axis=1))
    X=scaler_knn.transform(i.drop(['Shipment_Price'],axis=1))
    y=i['Shipment_Price']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
    knn=KNeighborsRegressor(n_neighbors=10)
    knn.fit(X_train,y_train)
    r2=knn.score(X_test,y_test)
    adj_r2=adjusted_r2(r2=r2,data=i)
    scores_KNN_reg.append(adj_r2)
print(f"Adj r2 score for Entire Data : {scores_KNN_reg[0]}")
print(f"Adj r2 score for Data Cluster 1 : {scores_KNN_reg[1]}")
print(f"Adj r2 score for Data Cluster 2 : {scores_KNN_reg[2]}")
print(f"Adj r2 score for Data Cluster 3 : {scores_KNN_reg[3]}")
scores=pd.DataFrame([scores_linreg,scores_elasticreg,scores_ridgereg,scores_SVM_reg,scores_dtree_reg,scores_rand_forest,scores_KNN_reg,scores_xgb]
,index=['Linreg-r2','Elastic-net r2','Ridge-r2','SVR-r2','Dtree-r2','Rand-Forest-r2','KNN-r2',"XGB-r2"],
columns=['Entire Data','Cluster 1','Cluster 2','Cluster 3'])
scores
for i in scores.columns:
    print(f"Best Model for {i} : {scores[i].idxmax()} ")
# From the above results, we can conclude that the models generalise worse on the individual clusters than on the whole dataset,
# so we don't need to cluster the data.
# Hence, the best model is the XGBoost Regressor trained on the full dataset.
X=data.drop(['Shipment_Price'],axis=1)
y=data['Shipment_Price']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
xgb_reg=XGBRegressor(subsample=0.7,max_depth=9,colsample_bytree=0.8,eta=0.1)
xgb_reg.fit(X_train,y_train)
import pickle
filename="xgboost.pickle"
pickle.dump(xgb_reg,open(filename,'wb'))
model=pickle.load(open(filename,'rb'))
pred=model.predict(X_test)
r2_score(y_test,pred)
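# Minimal usage sketch for the saved model (assumption: any new record is preprocessed and encoded exactly like X_train, with the same column order)
sample=X_test.iloc[[0]]             # stand-in for an incoming shipment record
print(model.predict(sample)[0])     # predicted Shipment_Price in USD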