import pandas as pd
import os
# Change data_path to the project root on your machine; every other path derives from it.
data_path = r'C:\Users\rtgtx\Desktop\SYSC5703\Destination_Prediction_Project'
test_data = pd.read_csv(os.path.join(data_path, 'test.csv'))
submission_file_path = os.path.join(data_path, 'submission.csv')
zone_feature_files_paths = {
"Chukyo": os.path.join(data_path, "Zone_features", "Chukyo_zone_feature_area.csv"),
"Higashisurugawan": os.path.join(data_path, "Zone_features", "Higashisurugawan_zone_feature_area.csv"),
"Kinki": os.path.join(data_path, "Zone_features", "Kinki_zone_feature_area.csv"),
"Kyushu": os.path.join(data_path, "Zone_features", "Kyushu_zone_feature_area.csv"),
"Tokyo": os.path.join(data_path, "Zone_features", "Tokyo_zone_feature_area.csv")
}
train_files_paths = {
"Chukyo": os.path.join(data_path, "train", "Chukyo.csv"),
"Higashisurugawan": os.path.join(data_path, "train", "Higashisurugawan.csv"),
"Kyushu": os.path.join(data_path, "train", "Kyushu.csv"),
"Tokyo": os.path.join(data_path, "train", "Tokyo.csv")
}
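# Note: Kinki has zone features but no training file, so it is presumably the
# held-out test region; the submission step at the end assumes this.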
# Load the data
zone_feature_files = {name: pd.read_csv(path) for name, path in zone_feature_files_paths.items()}
train_files = {name: pd.read_csv(path) for name, path in train_files_paths.items()}
# Missing values
def check_missing_values(dataframes):
    """Return, per dataframe, the columns that contain missing values and their counts."""
    missing_values = {}
    for name, df in dataframes.items():
        missing = df.isnull().sum()
        missing_values[name] = missing[missing > 0]
    return missing_values
missing_zone_features = check_missing_values(zone_feature_files)
missing_train_data = check_missing_values(train_files)
missing_zone_features, missing_train_data  # displayed automatically as the last line of a notebook cell
import numpy as np  # needed for np.nan in the test-data encoding below
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
def merge_zone_features(main_df, zone_df):
    """Attach zone-level features to each trip by joining on the trip's origin zone."""
    return main_df.merge(zone_df, left_on='Origin', right_on='ZONE_ID', how='left')
# Load and merge datasets
merged_data = {
"Chukyo": merge_zone_features(train_files['Chukyo'], zone_feature_files['Chukyo']),
"Higashisurugawan": merge_zone_features(train_files['Higashisurugawan'], zone_feature_files['Higashisurugawan']),
"Kyushu": merge_zone_features(train_files['Kyushu'], zone_feature_files['Kyushu']),
"Tokyo": merge_zone_features(train_files['Tokyo'], zone_feature_files['Tokyo'])
}
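# The left join leaves NaNs in the zone-feature columns for any Origin without a
# matching ZONE_ID; those gaps are filled by the imputer below.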
combined_data = pd.concat(merged_data.values(), axis=0, ignore_index=True)  # reset the index so rows stay unique across regions
# Top 100 most frequent destinations
top_destinations = combined_data['Destination'].value_counts().head(100).index
combined_data = combined_data[combined_data['Destination'].isin(top_destinations)]
# Impute missing values using the mean strategy for numerical columns
imputer = SimpleImputer(strategy='mean')
num_cols = ['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
combined_data[num_cols] = imputer.fit_transform(combined_data[num_cols])
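# Keep this fitted imputer: it is reused with .transform() on the test set at the end.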
# Encode categorical variables
label_encoders = {}
for col in ['Gender', 'Occupation', 'Trip_type', 'Origin']:  # 'Origin' is encoded here but not included in the feature list below
le = LabelEncoder()
combined_data[col] = le.fit_transform(combined_data[col])
label_encoders[col] = le
for col in ['Gender', 'Occupation', 'Trip_type']:
    # Labels seen during training
    known_labels = set(label_encoders[col].classes_)
    # Map unseen labels to NaN, then fall back to classes_[0]; note this is the
    # first class in sorted order, not necessarily the most frequent one.
    test_data[col] = test_data[col].apply(lambda x: x if x in known_labels else np.nan)
    test_data[col] = test_data[col].fillna(label_encoders[col].classes_[0])
    # Apply the label encoding fitted on the training data
    test_data[col] = label_encoders[col].transform(test_data[col])
# Encode the target variable 'Destination'
le_destination = LabelEncoder()
y = le_destination.fit_transform(combined_data['Destination'])
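# Keep le_destination: it is needed later to map encoded predictions back to destination IDs.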
# Select features
features = ['Gender', 'Age', 'Occupation', 'Trip_type', 'T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
X = combined_data[features]
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
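# Keep the fitted scaler too: the test features must be transformed with the same scaling.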
print(combined_data['Destination'].nunique()) # Check number of unique values in 'Destination'
print(combined_data['Destination'].value_counts().head(10)) # Inspect top 10 frequent classes
# split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=64)
# Train XGBoost on the GPU. Note: use_label_encoder and tree_method='gpu_hist'
# match the XGBoost 1.x API; in XGBoost >= 2.0 the equivalent is
# tree_method='hist' with device='cuda', and use_label_encoder is gone.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', random_state=64)
xgb.fit(X_train, y_train)
# Predict and evaluate
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Optional: KNN baseline for comparison (can be skipped)
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X_scaled, y, test_size=0.2, random_state=48)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)  # fit on the KNN split, not the XGBoost split
y_pred_knn = knn.predict(X_test_knn)
accuracy = accuracy_score(y_test_knn, y_pred_knn)
conf_matrix = confusion_matrix(y_test_knn, y_pred_knn)
class_report = classification_report(y_test_knn, y_pred_knn)
print("KNN Test Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Optional: 5-fold cross-validation for XGBoost (can be skipped)
from sklearn.model_selection import cross_val_score
xgb_cv = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', random_state=48)
scores = cross_val_score(xgb_cv, X_train, y_train, cv=5, scoring='accuracy')
print("XGBoost Cross-Validation Accuracy Scores:", scores)
print("Mean Cross-Validation Accuracy:", scores.mean())
# Prepare the test features the same way as the training data. The test region
# is assumed to be Kinki here (it has zone features but no training file);
# swap in the correct zone table if your test set covers a different region.
test_merged = merge_zone_features(test_data, zone_feature_files['Kinki'])
test_merged[num_cols] = imputer.transform(test_merged[num_cols])  # reuse the fitted imputer
X_test_scaled = scaler.transform(test_merged[features])           # reuse the fitted scaler
y_test_pred = xgb.predict(X_test_scaled)
test_data['Destination'] = le_destination.inverse_transform(y_test_pred)
submission = test_data[['Trip_id', 'Destination']]
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved to {submission_file_path}")