import pandas as pd
import os
# Change data_path to the project root on your machine; every other path derives from it.
data_path = r'C:\Users\rtgtx\Desktop\SYSC5703\Destination_Prediction_Project'
test_data = pd.read_csv(os.path.join(data_path, 'test.csv'))
submission_file_path = os.path.join(data_path, 'submission.csv')
zone_feature_files_paths = {
"Chukyo": os.path.join(data_path, "Zone_features", "Chukyo_zone_feature_area.csv"),
"Higashisurugawan": os.path.join(data_path, "Zone_features", "Higashisurugawan_zone_feature_area.csv"),
"Kinki": os.path.join(data_path, "Zone_features", "Kinki_zone_feature_area.csv"),
"Kyushu": os.path.join(data_path, "Zone_features", "Kyushu_zone_feature_area.csv"),
"Tokyo": os.path.join(data_path, "Zone_features", "Tokyo_zone_feature_area.csv")
}
train_files_paths = {
"Chukyo": os.path.join(data_path, "train", "Chukyo.csv"),
"Higashisurugawan": os.path.join(data_path, "train", "Higashisurugawan.csv"),
"Kyushu": os.path.join(data_path, "train", "Kyushu.csv"),
"Tokyo": os.path.join(data_path, "train", "Tokyo.csv")
}
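# Note: Kinki has zone features but no training file, so it is presumably the
# held-out test region; the submission step at the end assumes this.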
# Load the data
zone_feature_files = {name: pd.read_csv(path) for name, path in zone_feature_files_paths.items()}
train_files = {name: pd.read_csv(path) for name, path in train_files_paths.items()}
# Missing values
def check_missing_values(dataframes):
    """Return, per dataframe, the columns that contain missing values and their counts."""
    missing_values = {}
    for name, df in dataframes.items():
        missing = df.isnull().sum()
        missing_values[name] = missing[missing > 0]
    return missing_values
missing_zone_features = check_missing_values(zone_feature_files)
missing_train_data = check_missing_values(train_files)
missing_zone_features, missing_train_data  # displayed automatically as the last line of a notebook cell
import numpy as np  # needed for np.nan in the test-data encoding below
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
def merge_zone_features(main_df, zone_df):
    """Attach zone-level features to each trip by joining on the trip's origin zone."""
    return main_df.merge(zone_df, left_on='Origin', right_on='ZONE_ID', how='left')
# Load and merge datasets
merged_data = {
"Chukyo": merge_zone_features(train_files['Chukyo'], zone_feature_files['Chukyo']),
"Higashisurugawan": merge_zone_features(train_files['Higashisurugawan'], zone_feature_files['Higashisurugawan']),
"Kyushu": merge_zone_features(train_files['Kyushu'], zone_feature_files['Kyushu']),
"Tokyo": merge_zone_features(train_files['Tokyo'], zone_feature_files['Tokyo'])
}
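# The left join leaves NaNs in the zone-feature columns for any Origin without a
# matching ZONE_ID; those gaps are filled by the imputer below.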
combined_data = pd.concat(merged_data.values(), axis=0, ignore_index=True)  # reset the index so rows stay unique across regions
# Top 100 most frequent destinations
top_destinations = combined_data['Destination'].value_counts().head(100).index
combined_data = combined_data[combined_data['Destination'].isin(top_destinations)]
# Impute missing values using the mean strategy for numerical columns
imputer = SimpleImputer(strategy='mean')
num_cols = ['T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
combined_data[num_cols] = imputer.fit_transform(combined_data[num_cols])
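# Keep this fitted imputer: it is reused with .transform() on the test set at the end.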
# Encode categorical variables
label_encoders = {}
for col in ['Gender', 'Occupation', 'Trip_type', 'Origin']:  # 'Origin' is encoded here but not included in the feature list below
le = LabelEncoder()
combined_data[col] = le.fit_transform(combined_data[col])
label_encoders[col] = le
for col in ['Gender', 'Occupation', 'Trip_type']:
    # Labels seen during training
    known_labels = set(label_encoders[col].classes_)
    # Map unseen labels to NaN, then fall back to classes_[0]; note this is the
    # first class in sorted order, not necessarily the most frequent one.
    test_data[col] = test_data[col].apply(lambda x: x if x in known_labels else np.nan)
    test_data[col] = test_data[col].fillna(label_encoders[col].classes_[0])
    # Apply the label encoding fitted on the training data
    test_data[col] = label_encoders[col].transform(test_data[col])
# Encode the target variable 'Destination'
le_destination = LabelEncoder()
y = le_destination.fit_transform(combined_data['Destination'])
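# Keep le_destination: it is needed later to map encoded predictions back to destination IDs.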
# Select features
features = ['Gender', 'Age', 'Occupation', 'Trip_type', 'T000918002', 'T000918006', 'T000918021', 'T000918025', 'T000847001']
X = combined_data[features]
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
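# Keep the fitted scaler too: the test features must be transformed with the same scaling.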
print(combined_data['Destination'].nunique()) # Check number of unique values in 'Destination'
print(combined_data['Destination'].value_counts().head(10)) # Inspect top 10 frequent classes
# split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=64)
# Train XGBoost on the GPU. Note: use_label_encoder and tree_method='gpu_hist'
# match the XGBoost 1.x API; in XGBoost >= 2.0 the equivalent is
# tree_method='hist' with device='cuda', and use_label_encoder is gone.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', random_state=64)
xgb.fit(X_train, y_train)
# Predict and evaluate
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Optional: KNN baseline for comparison (can be skipped)
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X_scaled, y, test_size=0.2, random_state=48)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)  # fit on the KNN split, not the XGBoost split
y_pred_knn = knn.predict(X_test_knn)
accuracy = accuracy_score(y_test_knn, y_pred_knn)
conf_matrix = confusion_matrix(y_test_knn, y_pred_knn)
class_report = classification_report(y_test_knn, y_pred_knn)
print("KNN Test Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
# Optional: 5-fold cross-validation for XGBoost (can be skipped)
from sklearn.model_selection import cross_val_score
xgb_cv = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', random_state=48)
scores = cross_val_score(xgb_cv, X_train, y_train, cv=5, scoring='accuracy')
print("XGBoost Cross-Validation Accuracy Scores:", scores)
print("Mean Cross-Validation Accuracy:", scores.mean())
# Prepare the test features the same way as the training data. The test region
# is assumed to be Kinki here (it has zone features but no training file);
# swap in the correct zone table if your test set covers a different region.
test_merged = merge_zone_features(test_data, zone_feature_files['Kinki'])
test_merged[num_cols] = imputer.transform(test_merged[num_cols])  # reuse the fitted imputer
X_test_scaled = scaler.transform(test_merged[features])           # reuse the fitted scaler
y_test_pred = xgb.predict(X_test_scaled)
test_data['Destination'] = le_destination.inverse_transform(y_test_pred)
submission = test_data[['Trip_id', 'Destination']]
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved to {submission_file_path}")