# Test-set inference script: impute, engineer features, one-hot encode,
# scale, predict, and write `submission.csv`.
#
# Names used here but defined outside this section: `pd`, `np`, `lgb`, `df`,
# `scaler`, `X_train_scaled`, `y_train`.
from sklearn.preprocessing import OneHotEncoder

tdf = pd.read_csv('test.csv')

# Fill numeric gaps with the column mean.  Assign the result back instead of
# calling `fillna(inplace=True)` on `tdf[feature]`: the chained in-place form
# is deprecated in pandas 2.x and is a no-op under Copy-on-Write.
missing_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for feature in missing_features:
    tdf[feature] = tdf[feature].fillna(tdf[feature].mean())

# Fill categorical gaps with the most frequent value of the column.
categorical_features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
for feature in categorical_features:
    tdf[feature] = tdf[feature].fillna(tdf[feature].mode()[0])

# Identifier / free-text columns are not used as model inputs.
tdf.drop(columns=['PassengerId', 'Name'], inplace=True)

# 'Cabin' is split on '/' into three parts, then the raw column is dropped.
tdf[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = tdf['Cabin'].str.split('/', expand=True)
tdf.drop(columns=['Cabin'], inplace=True)

# Spending columns: log1p transform, then clip to the 1.5 * IQR fences
# computed on the transformed test column itself.
features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in features:
    tdf[col] = np.log1p(tdf[col])
    Q1 = tdf[col].quantile(0.25)
    Q3 = tdf[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    tdf[col] = tdf[col].clip(lower=lower, upper=upper)

# Age is clipped with fences taken from `df`, not `tdf`.
# NOTE(review): `df` is defined outside this section (presumably the training
# frame); confirm that using its quantiles here is intentional, since the
# spending columns above use test-set quantiles.
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
tdf['Age'] = tdf['Age'].clip(lower=lower, upper=upper)

# One-hot encode the categorical columns.
# NOTE(review): the encoder is fit on the test frame here, so the produced
# columns depend on which categories occur in test.csv; if an encoder fitted
# on the training data exists, reusing it would guarantee matching columns.
one_hot_features = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP', 'Cabin_deck', 'Cabin_side']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = ohe.fit_transform(tdf[one_hot_features])
encoded_cols = ohe.get_feature_names_out(one_hot_features)
encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=tdf.index)
tdf_cleaned_encoded = pd.concat([tdf.drop(columns=one_hot_features), encoded_df], axis=1)

# Remove columns that are not fed to the scaler / model.
tdf_cleaned_encoded.drop(columns=['Cabin_num'], inplace=True)
tdf_cleaned_encoded.drop(
    columns=[
        'Cabin_deck_A',
        'Destination_PSO J318.5-22',
        'VIP_False',
        'Cabin_deck_T',
        'Cabin_deck_G',
        'HomePlanet_Mars',
        'Cabin_deck_D',
        'VIP_True',
        'Destination_TRAPPIST-1e',
        'Cabin_side_S',
        'Cabin_side_P',
        'Cabin_deck_F',
        'Cabin_deck_E',
        'Destination_55 Cancri e',
        'Cabin_deck_C',
    ],
    inplace=True,
)

# Apply the already-fitted scaler (defined outside this section).
X_test_scaled = scaler.transform(tdf_cleaned_encoded)

model = lgb.LGBMClassifier()
model.fit(X_train_scaled, y_train)

# The model is fit on scaled features, so it must predict on the scaled test
# matrix; predicting on the unscaled frame would compare raw values against
# split thresholds learned in scaled space.
y_pred = model.predict(X_test_scaled)

# Re-read the raw file to recover 'PassengerId', which was dropped above.
# Row order is unchanged by the preprocessing, so predictions align by position.
tdf = pd.read_csv('test.csv')
tdf['pred'] = y_pred

# Submission file: one row per passenger with the predicted label.
result = pd.DataFrame({
    'PassengerId': tdf['PassengerId'],
    'Transported': tdf['pred'],
})
result.to_csv("submission.csv", index=False)