import numpy as np   
import pandas as pd            
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


df = pd.read_csv('./hotel_bookings.csv')
df.head()


# check null values percentage
df.isnull().mean()

hotel                             0.000000
is_canceled                       0.000000
lead_time                         0.000000
arrival_date_year                 0.000000
arrival_date_month                0.000000
arrival_date_week_number          0.000000
arrival_date_day_of_month         0.000000
stays_in_weekend_nights           0.000000
stays_in_week_nights              0.000000
adults                            0.000000
children                          0.000034
babies                            0.000000
meal                              0.000000
country                           0.004087
market_segment                    0.000000
distribution_channel              0.000000
is_repeated_guest                 0.000000
previous_cancellations            0.000000
previous_bookings_not_canceled    0.000000
reserved_room_type                0.000000
assigned_room_type                0.000000
booking_changes                   0.000000
deposit_type                      0.000000
agent                             0.136862
company                           0.943069
days_in_waiting_list              0.000000
customer_type                     0.000000
adr                               0.000000
required_car_parking_spaces       0.000000
total_of_special_requests         0.000000
reservation_status                0.000000
reservation_status_date           0.000000
dtype: float64


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal                            119390 non-null  object 
 13  country                         118902 non-null  object 
 14  market_segment                  119390 non-null  object 
 15  distribution_channel            119390 non-null  object 
 16  is_repeated_guest               119390 non-null  int64  
 17  previous_cancellations          119390 non-null  int64  
 18  previous_bookings_not_canceled  119390 non-null  int64  
 19  reserved_room_type              119390 non-null  object 
 20  assigned_room_type              119390 non-null  object 
 21  booking_changes                 119390 non-null  int64  
 22  deposit_type                    119390 non-null  object 
 23  agent                           103050 non-null  float64
 24  company                         6797 non-null    float64
 25  days_in_waiting_list            119390 non-null  int64  
 26  customer_type                   119390 non-null  object 
 27  adr                             119390 non-null  float64
 28  required_car_parking_spaces     119390 non-null  int64  
 29  total_of_special_requests       119390 non-null  int64  
 30  reservation_status              119390 non-null  object 
 31  reservation_status_date         119390 non-null  object 
dtypes: float64(4), int64(16), object(12)
memory usage: 29.1+ MB


df.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.99]).T


df.hist(figsize=(20,15))
plt.show()


plt.figure(figsize=(15,8))
sns.countplot(x='hotel'
             ,data=df
             ,hue='is_canceled'
             ,palette=sns.color_palette('Set2',2)
            )

<Axes: xlabel='hotel', ylabel='count'>


hotel_cancellation_num = df.loc[df['is_canceled'] == 1]['hotel'].value_counts()
hotel_num = df['hotel'].value_counts()
hotel_cancellation_ratio = (hotel_cancellation_num / hotel_num).sort_values(ascending=False)

print('Hotel Cancellation Ratio'.center(20), hotel_cancellation_ratio, sep='\n')

Hotel Cancellation Ratio
City Hotel      0.417270
Resort Hotel    0.277634
Name: hotel, dtype: float64


booking_city_hotel = df[(df['hotel']=='City Hotel') & (df['is_canceled']==0)]
booking_resort_hotel = df[(df['hotel']=='Resort Hotel') & (df['is_canceled']==0)]


booking_city_hotel['arrival_date_year_month'] = booking_city_hotel['arrival_date_year'].astype(str) + ' ' + booking_city_hotel['arrival_date_month'].str[:3]


booking_city_hotel['arrival_date_year_month'] = booking_city_hotel['arrival_date_year'].astype(str) + ' ' + booking_city_hotel['arrival_date_month'].str[:3]
booking_resort_hotel['arrival_date_year_month'] = booking_resort_hotel['arrival_date_year'].astype(str) + ' ' + booking_resort_hotel['arrival_date_month'].str[:3]


city_month = booking_city_hotel['arrival_date_year_month'].value_counts()
resort_month = booking_resort_hotel['arrival_date_year_month'].value_counts()

month = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
year = ['2015', '2016', '2017']
month_index = [year[0]+" " +a for a in month[6:]] + [year[1]+" "+a for a in month] + [year[2]+" "+a for a in month[:8]]

city_month_new = pd.DataFrame({'month_index':month_index})
city_month_new['bookings'] = city_month_new['month_index'].map(city_month.to_dict())
resort_month_new = pd.DataFrame({'month_index':month_index})
resort_month_new['bookings'] = resort_month_new['month_index'].map(resort_month.to_dict())

name = []
for i in range(len(city_month_new.month_index)):
    if i%2 == 0:
        name.append(city_month_new.month_index[i])
    else:
        name.append('')

x = city_month_new.month_index
y1 = city_month_new.bookings
y2 = resort_month_new.bookings

plt.figure(figsize=(15,8),dpi=80)
plt.plot(x,y1,label='City Hotel',color='lightsalmon')
plt.plot(x,y2,label='Resort Hotel',color='lightseagreen')
plt.xticks(x,name,fontsize=10)
plt.legend(fontsize=14)
plt.xlabel('Month', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.title('Hotel Monthly Booking', fontsize=20)
for i,j in zip(x,y1):
    plt.text(i,j,'%d' % j,ha = 'center',va = 'bottom', fontsize=10)
    
for i,j in zip(x,y2):
    plt.text(i,j,'%d' % j,ha = 'center',va = 'top', fontsize=10)


country_booking = df['country'].value_counts()[:10]
country_cancellation = df[(df.country.isin (country_booking.index)) & (df.is_canceled==1)]['country'].value_counts()

plt.figure(figsize=(15,8))
sns.set(font_scale=1.5)
sns.countplot(x='country'
              ,data=df[df.country.isin (country_booking.index)]
              ,hue='is_canceled'
              ,palette=sns.color_palette('Set2',2)
             )
plt.title('Top 10 Booking Countries', fontsize=20)

Text(0.5, 1.0, 'Top 10 Booking Countries')


country_cancellation_ratio=(country_cancellation/country_booking).sort_values(ascending=False)
print('Cancellation Ratio by Country'.center(10),country_cancellation_ratio,sep='\n')

Cancellation Ratio by Country
PRT    0.566351
BRA    0.373201
ITA    0.353956
ESP    0.254085
IRL    0.246519
BEL    0.202391
GBR    0.202243
FRA    0.185694
NLD    0.183935
DEU    0.167147
Name: country, dtype: float64


city_customer = booking_city_hotel['customer_type'].value_counts()
resort_customer = booking_resort_hotel['customer_type'].value_counts()

plt.figure(figsize=(21,12),dpi=80)
plt.subplot(1,2,1)
plt.pie(city_customer,labels=city_customer.index,autopct='%.2f%%')
plt.legend(loc=1)
plt.title('City Hotel Customer Type')
plt.subplot(1,2,2)
plt.pie(resort_customer,labels=resort_customer.index,autopct='%.2f%%')
plt.title('Resort Hotel Customer Type')
plt.legend()
plt.show()


city_segment = booking_city_hotel.market_segment.value_counts()
resort_segment = booking_resort_hotel.market_segment.value_counts()

plt.figure(figsize=(21,12),dpi=80)
plt.subplot(1,2,1)
plt.pie(city_segment,labels=city_segment.index,autopct='%.2f%%')
plt.legend()
plt.title('City Hotel Market Segment')
plt.subplot(1,2,2)
plt.pie(resort_segment,labels=resort_segment.index,autopct='%.2f%%')
plt.title('Resort Hotel Market Segment')
plt.legend()
plt.show()


plt.figure(figsize=(15,8))
sns.boxplot(x='customer_type'
            ,y='adr'
            ,hue='hotel'
            ,data=df[df.is_canceled==0]
            ,palette=sns.color_palette('Set2',2)
           )
plt.title('Average Daily Rate of Different Customer Type')

Text(0.5, 1.0, 'Average Daily Rate of Different Customer Type')


plt.figure(figsize=(15,8))
sns.countplot(x='is_repeated_guest'
              ,data=df
              ,hue='is_canceled'
              ,palette=sns.color_palette('Set2',2)
             )
plt.title('New/Repeated Guest Amount')
plt.xticks(range(2),['no','yes'])

([<matplotlib.axis.XTick at 0x21cc76079d0>,
  <matplotlib.axis.XTick at 0x21cc8ead110>],
 [Text(0, 0, 'no'), Text(1, 0, 'yes')])


guest_cancellation = (df.loc[df['is_canceled']==1]['is_repeated_guest'].value_counts()/df['is_repeated_guest'].value_counts()).sort_values(ascending=False)
guest_cancellation.index=['New Guest', 'Repeated Guest']
print('New/Repeated Guest Cancellation Ratio'.center(15),guest_cancellation,sep='\n')

New/Repeated Guest Cancellation Ratio
New Guest         0.377851
Repeated Guest    0.144882
Name: is_repeated_guest, dtype: float64


print('Deposit types and number of bookings'.center(15),df['deposit_type'].value_counts(),sep='\n')

Deposit types and number of bookings
No Deposit    104641
Non Refund     14587
Refundable       162
Name: deposit_type, dtype: int64


deposit_cancellation = (df.loc[df['is_canceled']==1]['deposit_type'].value_counts()/df['deposit_type'].value_counts()).sort_values(ascending=False)

plt.figure(figsize=(8,5))
x=range(len(deposit_cancellation.index))
y=deposit_cancellation.values
plt.bar(x,y,label='Cancellation_Rate',color=['orangered','lightsalmon','lightseagreen'],width=0.4)
plt.xticks(x,deposit_cancellation.index)
plt.legend()
plt.title('Cancellation Ratio of Deposite Type')
for x,y in zip(x,y):
    plt.text(x,y,'%.2f' % y,ha = 'center',va = 'bottom')


plt.figure(figsize=(15,8))
sns.countplot(x='assigned_room_type'
              ,data=df
              ,hue='is_canceled'
              ,palette=sns.color_palette('Set2',2)
              ,order = df['assigned_room_type'].value_counts().index
             )
plt.title('Booking & Cancellation Amount of Room Type')

Text(0.5, 1.0, 'Booking & Cancellation Amount of Room Type')


room_cancellation = df.loc[df['is_canceled']==1]['assigned_room_type'].value_counts()[:5]/df['assigned_room_type'].value_counts()[:5]
print('Top 5 cancelled room types'.center(5),room_cancellation.sort_values(ascending=False),sep='\n')

Top 5 cancelled room types
A    0.444925
G    0.305523
E    0.252114
D    0.251244
F    0.247134
Name: assigned_room_type, dtype: float64


df_new = df.drop(labels=['reservation_status_date'],axis=1)


df_new.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             float64
company                           float64
days_in_waiting_list                int64
customer_type                      object
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
reservation_status                 object
dtype: object


text_categorical_cols = df_new.columns[df_new.dtypes=='object'].tolist()
numerical_categorical_cols = ['agent', 'company', 'is_repeated_guest']
categorical_cols = text_categorical_cols + numerical_categorical_cols


df_new.isnull().mean()

hotel                             0.000000
is_canceled                       0.000000
lead_time                         0.000000
arrival_date_year                 0.000000
arrival_date_month                0.000000
arrival_date_week_number          0.000000
arrival_date_day_of_month         0.000000
stays_in_weekend_nights           0.000000
stays_in_week_nights              0.000000
adults                            0.000000
children                          0.000034
babies                            0.000000
meal                              0.000000
country                           0.004087
market_segment                    0.000000
distribution_channel              0.000000
is_repeated_guest                 0.000000
previous_cancellations            0.000000
previous_bookings_not_canceled    0.000000
reserved_room_type                0.000000
assigned_room_type                0.000000
booking_changes                   0.000000
deposit_type                      0.000000
agent                             0.136862
company                           0.943069
days_in_waiting_list              0.000000
customer_type                     0.000000
adr                               0.000000
required_car_parking_spaces       0.000000
total_of_special_requests         0.000000
reservation_status                0.000000
dtype: float64


df_new[['agent','company']] = df_new[['agent','company']].fillna(0,axis=0)
df_new.loc[:,categorical_cols].isnull().mean()

hotel                   0.000000
arrival_date_month      0.000000
meal                    0.000000
country                 0.004087
market_segment          0.000000
distribution_channel    0.000000
reserved_room_type      0.000000
assigned_room_type      0.000000
deposit_type            0.000000
customer_type           0.000000
reservation_status      0.000000
agent                   0.000000
company                 0.000000
is_repeated_guest       0.000000
dtype: float64


df_new.loc[df_new['company'] == 0,'in_company']='NO'
df_new.loc[df_new['company'] != 0,'in_company']='YES'
df_new.loc[df_new['agent'] == 0,'in_agent']='NO'
df_new.loc[df_new['agent'] != 0,'in_agent']='YES'
df_new['is_repeated_guest'][df_new['is_repeated_guest']==0]='NO'
df_new['is_repeated_guest'][df_new['is_repeated_guest']==1]='YES'
df_new.loc[df_new['reserved_room_type'] == df_new['assigned_room_type'],'same_assignment']='YES'
df_new.loc[df_new['reserved_room_type'] != df_new['assigned_room_type'],'same_assignment']='NO'

df_new=df_new.drop(labels=['reserved_room_type','assigned_room_type','agent','company'],axis=1)
df_new[['in_agent','in_company','is_repeated_guest','same_assignment']].head()


for c in ['in_company','in_agent','same_assignment']:
    categorical_cols.append(c)

for c in ['reserved_room_type','assigned_room_type','agent','company']:
    categorical_cols.remove(c)
categorical_cols

['hotel',
 'arrival_date_month',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'deposit_type',
 'customer_type',
 'reservation_status',
 'is_repeated_guest',
 'in_company',
 'in_agent',
 'same_assignment']


# Fill ungiven 'country' attribute with the most frequent country
df_new['country']=df_new['country'].fillna(df_new['country'].mode()[0])
df_new['country'].unique()

array(['PRT', 'GBR', 'USA', 'ESP', 'IRL', 'FRA', 'ROU', 'NOR', 'OMN',
       'ARG', 'POL', 'DEU', 'BEL', 'CHE', 'CN', 'GRC', 'ITA', 'NLD',
       'DNK', 'RUS', 'SWE', 'AUS', 'EST', 'CZE', 'BRA', 'FIN', 'MOZ',
       'BWA', 'LUX', 'SVN', 'ALB', 'IND', 'CHN', 'MEX', 'MAR', 'UKR',
       'SMR', 'LVA', 'PRI', 'SRB', 'CHL', 'AUT', 'BLR', 'LTU', 'TUR',
       'ZAF', 'AGO', 'ISR', 'CYM', 'ZMB', 'CPV', 'ZWE', 'DZA', 'KOR',
       'CRI', 'HUN', 'ARE', 'TUN', 'JAM', 'HRV', 'HKG', 'IRN', 'GEO',
       'AND', 'GIB', 'URY', 'JEY', 'CAF', 'CYP', 'COL', 'GGY', 'KWT',
       'NGA', 'MDV', 'VEN', 'SVK', 'FJI', 'KAZ', 'PAK', 'IDN', 'LBN',
       'PHL', 'SEN', 'SYC', 'AZE', 'BHR', 'NZL', 'THA', 'DOM', 'MKD',
       'MYS', 'ARM', 'JPN', 'LKA', 'CUB', 'CMR', 'BIH', 'MUS', 'COM',
       'SUR', 'UGA', 'BGR', 'CIV', 'JOR', 'SYR', 'SGP', 'BDI', 'SAU',
       'VNM', 'PLW', 'QAT', 'EGY', 'PER', 'MLT', 'MWI', 'ECU', 'MDG',
       'ISL', 'UZB', 'NPL', 'BHS', 'MAC', 'TGO', 'TWN', 'DJI', 'STP',
       'KNA', 'ETH', 'IRQ', 'HND', 'RWA', 'KHM', 'MCO', 'BGD', 'IMN',
       'TJK', 'NIC', 'BEN', 'VGB', 'TZA', 'GAB', 'GHA', 'TMP', 'GLP',
       'KEN', 'LIE', 'GNB', 'MNE', 'UMI', 'MYT', 'FRO', 'MMR', 'PAN',
       'BFA', 'LBY', 'MLI', 'NAM', 'BOL', 'PRY', 'BRB', 'ABW', 'AIA',
       'SLV', 'DMA', 'PYF', 'GUY', 'LCA', 'ATA', 'GTM', 'ASM', 'MRT',
       'NCL', 'KIR', 'SDN', 'ATF', 'SLE', 'LAO'], dtype=object)


# Encode the categorical attributes

from sklearn.preprocessing import OrdinalEncoder

o_encoder = OrdinalEncoder()
df_new.loc[:,categorical_cols] = o_encoder.fit_transform(df_new.loc[:,categorical_cols])


numerical_cols = df_new.columns.tolist()
numerical_cols.remove('is_canceled')
for c in categorical_cols:
    numerical_cols.remove(c)
    
numerical_cols

['lead_time',
 'arrival_date_year',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']


df[numerical_cols].isnull().mean()

lead_time                         0.000000
arrival_date_year                 0.000000
arrival_date_week_number          0.000000
arrival_date_day_of_month         0.000000
stays_in_weekend_nights           0.000000
stays_in_week_nights              0.000000
adults                            0.000000
children                          0.000034
babies                            0.000000
previous_cancellations            0.000000
previous_bookings_not_canceled    0.000000
booking_changes                   0.000000
days_in_waiting_list              0.000000
adr                               0.000000
required_car_parking_spaces       0.000000
total_of_special_requests         0.000000
dtype: float64


# Fill ungiven 'children' with the most frequent value
df_new['children'] = df_new['children'].fillna(df_new['children'].mode()[0])


# Use standard scaler to scale the numerical attributes

from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()
df_new.loc[:,numerical_cols] = s_scaler.fit_transform(df_new.loc[:,numerical_cols])
df_new[numerical_cols].head()


corr_matrix = df_new.corr()
corr_matrix['is_canceled'].sort_values(ascending=False)

is_canceled                       1.000000
deposit_type                      0.468634
lead_time                         0.293123
country                           0.267502
same_assignment                   0.247770
distribution_channel              0.167600
previous_cancellations            0.110133
in_agent                          0.102068
adults                            0.060017
market_segment                    0.059338
days_in_waiting_list              0.054186
adr                               0.047557
stays_in_week_nights              0.024765
arrival_date_year                 0.016660
arrival_date_week_number          0.008148
children                          0.005036
arrival_date_month               -0.001491
stays_in_weekend_nights          -0.001791
arrival_date_day_of_month        -0.006130
meal                             -0.017678
babies                           -0.032491
previous_bookings_not_canceled   -0.057358
customer_type                    -0.068140
is_repeated_guest                -0.084793
in_company                       -0.099310
hotel                            -0.136531
booking_changes                  -0.144381
required_car_parking_spaces      -0.195498
total_of_special_requests        -0.234658
reservation_status               -0.917196
Name: is_canceled, dtype: float64


corr_matrix.index

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'in_company', 'in_agent', 'same_assignment'],
      dtype='object')


corr_cancel=abs(corr_matrix['is_canceled']).sort_values()
x=range(len(corr_cancel.index))
y=abs(corr_cancel.values)

sns.set(font_scale=1.25)
plt.figure(figsize=(8,15))
plt.barh(x,y,color='salmon')
plt.yticks(x,corr_cancel.index)
for x,y in zip(x,y):
    plt.text(y,x-0.1,'%.2f' % y,ha = 'center',va = 'bottom')
plt.xlabel('Corrleation')
plt.ylabel('Varriance')
plt.show()


df_cleaned = df_new.drop('reservation_status',axis=1)


x=df_cleaned.loc[:,df_cleaned.columns != 'is_canceled' ]
y=df_cleaned.loc[:,'is_canceled']

from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=90)


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score as cvs,KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score as AUC

rfc = RandomForestClassifier(n_estimators=100,random_state=90)
cv = KFold(n_splits=10, shuffle = True, random_state=90)
rfc_score = cvs(rfc,xtrain,ytrain,cv=cv).mean()
rfc.fit(xtrain,ytrain)
y_score=rfc.predict_proba(xtest)[:,1]
rfc_pred=rfc.predict(xtest)

fpr, recall, thresholds = roc_curve(ytest,y_score, pos_label=1)
rfc_auc = AUC(ytest,y_score)


plt.figure(figsize=(8,8))
plt.plot(fpr, recall, color='red',label='ROC curve (area = %0.2f)' % rfc_auc)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('Random Forest Classifier ROC Curve')
plt.legend(loc="lower right")
plt.show()


from xgboost import XGBClassifier

xgbr = XGBClassifier(n_estimators=100,random_state=90)
xgbr_score = cvs(xgbr,xtrain,ytrain,cv=cv).mean()
xgbr.fit(xtrain,ytrain)
y_score = xgbr.predict_proba(xtest)[:,1]
xgbr_pred = xgbr.predict(xtest)
fdr, recall, thresholds = roc_curve(ytest,y_score, pos_label=1)
xgbr_auc = AUC(ytest,y_score)


plt.figure(figsize=(8,8))
plt.plot(fdr, recall, color='red',label='ROC curve (area = %0.2f)' % xgbr_auc)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('XGBoost Classifier ROC Curve')
plt.legend(loc="lower right")
plt.show()


from sklearn.linear_model import LogisticRegression as LR
lr = LR(penalty='l2',solver='liblinear',max_iter=1000) 
lr_score = cvs(lr,xtrain,ytrain,cv=cv).mean()
lr.fit(xtrain,ytrain)
y_score = lr.predict_proba(xtest)[:,1]
lr_pred = lr.predict(xtest)
fdr, recall, thresholds = roc_curve(ytest,y_score, pos_label=1)
lr_auc = AUC(ytest,y_score)


plt.figure(figsize=(8,8))
plt.plot(fdr, recall, color='red',label='ROC curve (area = %0.2f)' % lr_auc)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('LogisticRegression ROC Curve')
plt.legend(loc="lower right")
plt.show()


from sklearn.metrics import classification_report
print('Random forest'.center(50), classification_report(ytest,rfc_pred),sep='\n')
print('XGBoost'.center(55),classification_report(ytest,xgbr_pred),sep='\n')
print('Logistic regression'.center(50),classification_report(ytest,lr_pred),sep='\n')

                  Random forest                   
              precision    recall  f1-score   support

           0       0.90      0.93      0.92     15153
           1       0.87      0.82      0.85      8725

    accuracy                           0.89     23878
   macro avg       0.89      0.88      0.88     23878
weighted avg       0.89      0.89      0.89     23878

                        XGBoost                        
              precision    recall  f1-score   support

           0       0.89      0.91      0.90     15153
           1       0.84      0.80      0.82      8725

    accuracy                           0.87     23878
   macro avg       0.87      0.86      0.86     23878
weighted avg       0.87      0.87      0.87     23878

               Logistic regression                
              precision    recall  f1-score   support

           0       0.80      0.91      0.85     15153
           1       0.79      0.60      0.68      8725

    accuracy                           0.80     23878
   macro avg       0.79      0.75      0.76     23878
weighted avg       0.80      0.80      0.79     23878


score={'Model_score':[rfc_score,xgbr_score,lr_score],'Auc_area':[rfc_auc,xgbr_auc,lr_auc]}
score_com=pd.DataFrame(data=score,index=['RandomForest','XGBoost','LogisticRegression'])
score_com.sort_values(by=['Model_score'],ascending=False)

	count	mean	std	min	1%	5%	10%	25%	50%	75%	99%	max
is_canceled	119390.0	0.370416	0.482918	0.00	0.0	0.0	0.0	0.00	0.000	1.0	1.00	1.0
lead_time	119390.0	104.011416	106.863097	0.00	0.0	0.0	3.0	18.00	69.000	160.0	444.00	737.0
arrival_date_year	119390.0	2016.156554	0.707476	2015.00	2015.0	2015.0	2015.0	2016.00	2016.000	2017.0	2017.00	2017.0
arrival_date_week_number	119390.0	27.165173	13.605138	1.00	2.0	5.0	8.0	16.00	28.000	38.0	53.00	53.0
arrival_date_day_of_month	119390.0	15.798241	8.780829	1.00	1.0	2.0	4.0	8.00	16.000	23.0	31.00	31.0
stays_in_weekend_nights	119390.0	0.927599	0.998613	0.00	0.0	0.0	0.0	0.00	1.000	2.0	4.00	19.0
stays_in_week_nights	119390.0	2.500302	1.908286	0.00	0.0	0.0	1.0	1.00	2.000	3.0	10.00	50.0
adults	119390.0	1.856403	0.579261	0.00	1.0	1.0	1.0	2.00	2.000	2.0	3.00	55.0
children	119386.0	0.103890	0.398561	0.00	0.0	0.0	0.0	0.00	0.000	0.0	2.00	10.0
babies	119390.0	0.007949	0.097436	0.00	0.0	0.0	0.0	0.00	0.000	0.0	0.00	10.0
is_repeated_guest	119390.0	0.031912	0.175767	0.00	0.0	0.0	0.0	0.00	0.000	0.0	1.00	1.0
previous_cancellations	119390.0	0.087118	0.844336	0.00	0.0	0.0	0.0	0.00	0.000	0.0	1.00	26.0
previous_bookings_not_canceled	119390.0	0.137097	1.497437	0.00	0.0	0.0	0.0	0.00	0.000	0.0	3.00	72.0
booking_changes	119390.0	0.221124	0.652306	0.00	0.0	0.0	0.0	0.00	0.000	0.0	3.00	21.0
agent	103050.0	86.693382	110.774548	1.00	1.0	1.0	6.0	9.00	14.000	229.0	390.00	535.0
company	6797.0	189.266735	131.655015	6.00	16.0	40.0	40.0	62.00	179.000	270.0	513.04	543.0
days_in_waiting_list	119390.0	2.321149	17.594721	0.00	0.0	0.0	0.0	0.00	0.000	0.0	75.00	391.0
adr	119390.0	101.831122	50.535790	-6.38	0.0	38.4	50.0	69.29	94.575	126.0	252.00	5400.0
required_car_parking_spaces	119390.0	0.062518	0.245291	0.00	0.0	0.0	0.0	0.00	0.000	0.0	1.00	8.0
total_of_special_requests	119390.0	0.571363	0.792798	0.00	0.0	0.0	0.0	0.00	0.000	1.0	3.00	5.0

	lead_time	arrival_date_year	arrival_date_week_number	arrival_date_day_of_month	stays_in_weekend_nights	stays_in_week_nights	adults	children	babies	previous_cancellations	previous_bookings_not_canceled	booking_changes	days_in_waiting_list	adr	required_car_parking_spaces	total_of_special_requests
0	2.227051	-1.634768	-0.012141	-1.685297	-0.92889	-1.310240	0.247897	-0.260659	-0.081579	-0.10318	-0.091555	4.260101	-0.131924	-2.015038	-0.254873	-0.720694
1	5.923385	-1.634768	-0.012141	-1.685297	-0.92889	-1.310240	0.247897	-0.260659	-0.081579	-0.10318	-0.091555	5.793131	-0.131924	-2.015038	-0.254873	-0.720694
2	-0.907814	-1.634768	-0.012141	-1.685297	-0.92889	-0.786207	-1.478447	-0.260659	-0.081579	-0.10318	-0.091555	-0.338990	-0.131924	-0.530935	-0.254873	-0.720694
3	-0.851667	-1.634768	-0.012141	-1.685297	-0.92889	-0.786207	-1.478447	-0.260659	-0.081579	-0.10318	-0.091555	-0.338990	-0.131924	-0.530935	-0.254873	-0.720694
4	-0.842309	-1.634768	-0.012141	-1.685297	-0.92889	-0.262174	0.247897	-0.260659	-0.081579	-0.10318	-0.091555	-0.338990	-0.131924	-0.075810	-0.254873	0.540666

	Model_score	Auc_area
RandomForest	0.889982	0.956306
XGBoost	0.873576	0.946361
LogisticRegression	0.796476	0.866788

Background¶

Exploratory data analysis¶

Data visualization analysis¶

Hotel bookings and cancellations¶

Hotel bookings per month¶

Guest country and cancellation ratio¶

Customer types¶

Market segment¶

Average daily rate of different customer type¶

New/Repeated Guest Amount¶

3 deposit types¶

Room types¶

Data Preprocessing¶

Categorical attributes¶

Numerical attributes¶

Attributes correlation¶

Prediction models¶

Random forest¶

XGBoost¶

Logistic regression¶

Model performance comparison¶

	hotel	lead_time	arrival_date_year	arrival_date_month	arrival_date_week_number	arrival_date_day_of_month	stays_in_week_nights	adults	...	deposit_type	agent	company	customer_type	adr	total_of_special_requests	reservation_status	reservation_status_date
0	Resort Hotel	342	2015	July	27	1	0	2	...	No Deposit	NaN	NaN	Transient	0.0	0	Check-Out	2015-07-01
1	Resort Hotel	737	2015	July	27	1	0	2	...	No Deposit	NaN	NaN	Transient	0.0	0	Check-Out	2015-07-01
2	Resort Hotel	7	2015	July	27	1	1	1	...	No Deposit	NaN	NaN	Transient	75.0	0	Check-Out	2015-07-02
3	Resort Hotel	13	2015	July	27	1	1	1	...	No Deposit	304.0	NaN	Transient	75.0	0	Check-Out	2015-07-02
4	Resort Hotel	14	2015	July	27	1	2	2	...	No Deposit	240.0	NaN	Transient	98.0	1	Check-Out	2015-07-03

	in_agent	in_company	is_repeated_guest	same_assignment
0	NO	NO	NO	YES
1	NO	NO	NO	YES
2	NO	NO	NO	NO
3	YES	NO	NO	YES
4	YES	NO	NO	YES