-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCode.py
147 lines (113 loc) · 4.92 KB
/
Code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Importing necessary libraries
import numpy as np
import pandas as pd
import os
# Folder that the CSV inputs are read from.
input_dir = '/kaggle/input/spaceship-titanic'

# Walk the folder tree and print the full path of every file found in it.
for folder, _, names in os.walk(input_dir):
    for name in names:
        print(os.path.join(folder, name))
# Load the training data. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) conversion is needed.
df = pd.read_csv(os.path.join(input_dir, 'train.csv'))

# Notebook-style inspection: a bare expression only displays its value when it
# is the last statement of an interactive cell; in a plain script it is a no-op.
df.head(5)  # first five rows
df.shape  # (row count, column count)

# Total number of missing cells across the whole table.
total_missing_value = df.isnull().sum().sum()
total_missing_value

# Data type of every column.
df.dtypes
# Suppressing warnings
# NOTE(review): "ignore" with no category or module argument silences every
# warning raised after this point, not only the noisy ones.
import warnings
warnings.filterwarnings("ignore")
# Importing visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Importing TensorFlow and TensorFlow Decision Forests
import tensorflow as tf
import tensorflow_decision_forests as tfdf
# Report the installed version of each of the two libraries.
for label, library in (
    ("TensorFlow version:", tf),
    ("TensorFlow Decision Forests version:", tfdf),
):
    print(label, library.__version__)
# Bar chart showing how many rows carry each value of the 'Transported' column.
barChart_Transported = df["Transported"].value_counts()
axes = barChart_Transported.plot(kind="bar")
axes.set_title('Distribution of Transported Passengers')
axes.set_xlabel('Transported')
axes.set_ylabel('Count')
plt.show()
# Remove the 'PassengerId' and 'Name' columns from the training table.
df = df.drop(columns=['PassengerId', 'Name'])
df.head(3)

# Replace NaNs with 0 in the listed columns, then show the remaining
# per-column missing counts, largest first.
zero_fill_columns = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df[zero_fill_columns] = df[zero_fill_columns].fillna(value=0)
df.isnull().sum().sort_values(ascending=False)

# Cast the two boolean columns to integers.
for flag_column in ('VIP', 'CryoSleep'):
    df[flag_column] = df[flag_column].astype(int)

# Split 'Cabin' on "/" into 'Deck', 'Cabin_num' and 'Side', then discard the
# original column.
cabin_parts = df["Cabin"].str.split("/", expand=True)
df[["Deck", "Cabin_num", "Side"]] = cabin_parts
df = df.drop(columns='Cabin')
df.dtypes
# Splitting the dataset into training and validation sets
def split_dataset(dataset, test_ratio=0.20, seed=None):
    """Randomly split *dataset* into a training part and a held-out part.

    Every row is assigned to the held-out part independently with
    probability ``test_ratio``, so the resulting sizes are only
    approximately proportional to the ratio.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (used here with a pandas DataFrame).
        test_ratio: Probability that a given row lands in the held-out part.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` (the default) the draw comes from
            NumPy's global random state, as it did before this parameter
            existed.

    Returns:
        A ``(train, held_out)`` tuple of two selections from ``dataset``.
    """
    if seed is None:
        # Keep using the legacy global state so existing np.random.seed()
        # calls made by the caller continue to control the split.
        draws = np.random.rand(len(dataset))
    else:
        draws = np.random.default_rng(seed).random(len(dataset))
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]
# Split the prepared table and report how many rows landed in each part.
train_ds_pd, valid_ds_pd = split_dataset(df)
size_report = "{} examples in training, {} examples in testing."
print(size_report.format(len(train_ds_pd), len(valid_ds_pd)))
train_ds_pd.head(2)
valid_ds_pd.head(2)

# Wrap each pandas part as a TensorFlow dataset, with 'Transported' as the
# label column.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(part, label='Transported')
    for part in (train_ds_pd, valid_ds_pd)
)
train_ds
# Show which model classes TensorFlow Decision Forests makes available.
tfdf.keras.get_all_models()

# Build a random forest from the "benchmark_rank1" hyperparameter template,
# register accuracy as the reported metric, and train on the training dataset.
forest_options = {"hyperparameter_template": "benchmark_rank1"}
Randforest_trdf = tfdf.keras.RandomForestModel(**forest_options)
Randforest_trdf.compile(metrics=["accuracy"])
Randforest_trdf.fit(x=train_ds)
# Plot the accuracy recorded in the model's training logs against the number
# of trees (the y-axis label marks it as out-of-bag accuracy).
logs = Randforest_trdf.make_inspector().training_logs()
tree_counts = [entry.num_trees for entry in logs]
accuracies = [entry.evaluation.accuracy for entry in logs]
plt.plot(tree_counts, accuracies)
plt.title("Training Progress")
plt.xlabel("Number of trees")
plt.ylabel("Accuracy (out-of-bag)")
plt.show()
# Score the trained model on the validation dataset and print its accuracy.
evaluation = Randforest_trdf.evaluate(x=valid_ds, return_dict=True)
validation_accuracy = evaluation['accuracy']
print("Validation Accuracy:", validation_accuracy)

# List the names of the variable-importance measures the inspector exposes.
inspector = Randforest_trdf.make_inspector()
print("Available variable importances:")
for importance_name in inspector.variable_importances():
    print("\t", importance_name)
# Preparing CSV for submission
# Load the test data and keep the passenger ids for the submission file.
test_df = pd.read_csv(os.path.join(input_dir, 'test.csv'))
submission_id = test_df.PassengerId

# Apply the same preprocessing as the training data received. The training
# table has NaNs replaced by 0 in all six of these columns; filling only 'VIP'
# and 'CryoSleep' here would hand the model differently prepared inputs at
# prediction time.
zero_fill_columns = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df[zero_fill_columns] = test_df[zero_fill_columns].fillna(value=0)

# Split 'Cabin' on "/" into 'Deck', 'Cabin_num' and 'Side', then drop it.
test_df[["Deck", "Cabin_num", "Side"]] = test_df["Cabin"].str.split("/", expand=True)
test_df = test_df.drop('Cabin', axis=1)

# Cast the two boolean columns to integers.
test_df['VIP'] = test_df['VIP'].astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].astype(int)
# Wrap the test table as a TensorFlow dataset (no label column is given).
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df)

# Run the model on the test dataset and threshold the scores at 0.5 to get
# boolean labels.
predictions = Randforest_trdf.predict(test_ds)
n_predictions = (predictions > 0.5).astype(bool)

# Pair every passenger id with its predicted label.
submission_columns = {
    'PassengerId': submission_id,
    'Transported': n_predictions.squeeze(),
}
output = pd.DataFrame(submission_columns)
output.head()
# Write the submission file: overwrite the 'Transported' column of the sample
# submission with the predicted labels and save it.
# NOTE(review): assumes sample_submission.csv lists passengers in the same
# order as test.csv - confirm.
sample_submission_df = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))
# n_predictions presumably has shape (rows, 1) (it is squeezed before being
# used elsewhere); flatten it so a plain 1-D array is assigned to the column.
sample_submission_df['Transported'] = n_predictions.ravel()
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()