机器学习算法讲堂(一) 十分钟入门机器学习算法竞赛

时间:2022-05-23 17:08:03

机器学习算法讲堂(一) 十分钟入门机器学习算法竞赛

比赛地址:https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

import pandas as pd
import numpy as np 
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

# Read only the first 1,000,000 rows of the training CSV.
file = pd.read_csv('./data/train.csv', nrows=1000000)

# Show the first rows and the (rows, columns) shape of what was loaded.
print(file.head())
print(file.shape)

# Discard every row that contains at least one missing value.
file = file.dropna(axis=0, how='any')
# Row filter for the raw data
def clean_df(df):
    """
    Return the rows of `df` with a strictly positive fare_amount and a
    passenger_count strictly between 0 and 10.

    The input frame is not modified; the boolean-indexed result is returned.
    """
    positive_fare = df.fare_amount > 0
    plausible_passengers = (df.passenger_count > 0) & (df.passenger_count < 10)
    # Bounding-box filters on the pickup/dropoff coordinates
    # (longitude in (-80, -70), latitude in (35, 45)) are currently disabled.
    return df[positive_fare & plausible_passengers]
# Apply the row filter, then report the remaining row count and the shape.
file = clean_df(file)
print(file.shape[0])
print(file.shape)

def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance between pickup and dropoff
    coordinates, in kilometres.

    All four arguments are latitudes/longitudes in degrees. Each may be a
    scalar or an array-like such as a pandas Series; the usual NumPy/pandas
    broadcasting applies, so the result has the broadcast shape of the inputs.
    """
    # Earth radius (km)
    R_earth = 6371
    # Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(
        np.radians, [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon])

    # Differences along the latitude and longitude dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon

    # Haversine term
    a = (np.sin(dlat / 2.0) ** 2
         + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon / 2.0) ** 2)
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)

    return 2 * R_earth * np.arcsin(np.sqrt(a))

def add_airport_dist(dataset):
    """
    Add landmark-distance, coordinate-delta and airport-flag columns to `dataset`.

    For each landmark the new column holds the smaller of the haversine
    distance (via sphere_dist) from the pickup point to the landmark and from
    the landmark to the dropoff point.
      JFK: John F. Kennedy International Airport  -> 'jfk_dist'
      EWR: Newark Liberty International Airport   -> 'ewr_dist'
      LGA: LaGuardia Airport                      -> 'lga_dist'
      Washington Square                           -> 'washington_dist'

    Further columns added:
      longitude_distance / latitude_distance: absolute coordinate deltas
      distance_travelled: Euclidean norm of those deltas (in degrees)
      distance_travelled_{sin,cos,sin_sqrd,cos_sqrd}: trigonometric transforms
      jfk / lga / ewr: 1 when the pickup or the dropoff point lies inside the
        airport's bounding box, else 0

    `dataset` must contain pickup_latitude, pickup_longitude, dropoff_latitude
    and dropoff_longitude. It is modified in place and also returned.
    """
    # (column name, (latitude, longitude)) in decimal degrees.
    landmarks = (
        ('jfk_dist', (40.639722, -73.778889)),
        ('ewr_dist', (40.6925, -74.168611)),
        ('lga_dist', (40.77725, -73.872611)),
        # Washington Square is at 40°43'51"N 73°59'51"W. The previous value
        # (40.4351, -73.5951) used those DMS digits as if they were decimal
        # degrees, which placed the landmark far from the park.
        ('washington_dist', (40.730833, -73.9975)),
    )

    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']

    for column, (lat, lon) in landmarks:
        from_pickup = sphere_dist(pickup_lat, pickup_lon, lat, lon)
        to_dropoff = sphere_dist(lat, lon, dropoff_lat, dropoff_lon)
        # Row-wise minimum of the two distances.
        dataset[column] = pd.concat([from_pickup, to_dropoff], axis=1).min(axis=1)

    dataset['longitude_distance'] = abs(dataset['pickup_longitude'] - dataset['dropoff_longitude'])
    dataset['latitude_distance'] = abs(dataset['pickup_latitude'] - dataset['dropoff_latitude'])

    # Straight distance
    lon_sq = dataset['longitude_distance'] ** 2
    lat_sq = dataset['latitude_distance'] ** 2
    dataset['distance_travelled'] = (lon_sq + lat_sq) ** .5

    # NOTE(review): the trigonometric features take the square root of the
    # PRODUCT of the squared deltas, whereas distance_travelled uses the SUM.
    # Kept as-is; confirm whether the product is intentional.
    product_root = (lon_sq * lat_sq) ** .5
    dataset['distance_travelled_sin'] = np.sin(product_root)
    dataset['distance_travelled_cos'] = np.cos(product_root)
    dataset['distance_travelled_sin_sqrd'] = np.sin(product_root) ** 2
    dataset['distance_travelled_cos_sqrd'] = np.cos(product_root) ** 2

    # dataset["fare_to_dist_ratio"] = dataset["fare_amount"] / ( dataset["distance_travelled"]+0.0001)
    # dataset["fare_npassenger_to_dist_ratio"] = (dataset["fare_amount"] / dataset["passenger_count"]) /( dataset["distance_travelled"]+0.0001)

    # (flag column, (lon_min, lon_max), (lat_min, lat_max)); bounds inclusive.
    airport_boxes = (
        ('jfk', (-73.7841, -73.7721), (40.6213, 40.6613)),
        ('lga', (-73.8870, -73.8580), (40.7680, 40.7800)),
        ('ewr', (-74.192, -74.172), (40.676, 40.708)),
    )
    for flag, (lon_min, lon_max), (lat_min, lat_max) in airport_boxes:
        dataset[flag] = 0
        # A trip is flagged when either of its endpoints falls inside the box.
        for endpoint in ('pickup', 'dropoff'):
            lon = dataset[endpoint + '_longitude']
            lat = dataset[endpoint + '_latitude']
            inside = ((lon >= lon_min) & (lon <= lon_max) &
                      (lat <= lat_max) & (lat >= lat_min))
            dataset.loc[inside, flag] = 1

    return dataset

def add_datetime_info(dataset):
    """
    Parse 'pickup_datetime' and add calendar columns derived from it.

    The column is parsed with the format "%Y-%m-%d %H:%M:%S UTC" and replaced
    by the parsed datetimes; 'hour', 'day', 'month', 'weekday' and 'year' are
    then added. `dataset` is modified in place and also returned.
    """
    # Convert the text timestamps to datetimes
    parsed = pd.to_datetime(dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")
    dataset['pickup_datetime'] = parsed

    # One new column per datetime component, in this order.
    for component in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[component] = getattr(dataset.pickup_datetime.dt, component)

    return dataset

def _engineer_features(frame):
    """
    Run the shared feature pipeline on `frame` and return the result:
    datetime parts, landmark/airport features, removal of the raw
    'pickup_datetime' column, then the pickup-to-dropoff 'distance'.
    """
    frame = add_datetime_info(frame)
    frame = add_airport_dist(frame)
    frame = frame.drop(columns=['pickup_datetime'])
    frame['distance'] = sphere_dist(frame['pickup_latitude'], frame['pickup_longitude'],
                                    frame['dropoff_latitude'], frame['dropoff_longitude'])
    return frame


# Training data: same pipeline as the test data below.
file = _engineer_features(file)

# Bare expression; has no visible effect when run as a script.
file.head()

# Test data: load, then apply the identical pipeline.
test_file = pd.read_csv('./data/test.csv')
test_file = _engineer_features(test_file)


# Bare expression; has no visible effect when run as a script.
test_file.head()


import datetime as dt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os


# Separate the target from the features; new_test is another name for test_file.
y = file['fare_amount']
train_x = file.drop(columns=['fare_amount'])
new_test = test_file

from sklearn.preprocessing import LabelEncoder

# Columns whose dtype is datetime64[ns] or object get integer label codes.
non_numeric = [
    name for name in train_x.columns
    if train_x[name].dtype == 'datetime64[ns]' or train_x[name].dtype == 'object'
]
for name in non_numeric:
    train_values = list(train_x[name].values)
    test_values = list(test_file[name].values)
    # Fit on train and test values together so both can be transformed.
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)
    train_x[name] = encoder.transform(train_values)
    test_file[name] = encoder.transform(test_values)
print(test_file.head())

# Hold out 1% of the training rows (fixed seed) as x_test / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    train_x, y, random_state=0, test_size=0.01)

# Disabled experiment, kept as an inert string literal: it reduced each
# 'pickup_datetime' value to its digits only.
'''

for x in range(0,len(x_train['pickup_datetime'])):
    try:
        time = ''
        for time_ac in str(x_train['pickup_datetime'].loc[x]):
            if time_ac <= '9' and time_ac >= '0':
                time = time + time_ac
        x_train['pickup_datetime'].loc[x] = time
    except:
        x_train['pickup_datetime'].loc[x] = 0
x_train['pickup_datetime'].astype('int64')
'''
print(x_train.dtypes)
# head is a method: call it so the first rows are printed rather than the
# bound-method object.
print(x_train.head())

# Disabled XGBoost baseline, kept as an inert string literal.
# NOTE(review): if this is re-enabled, check the parameter names against the
# installed XGBoost version ("reg:linear" and "silent" are legacy spellings).
'''

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
eta = 0.1
max_depth = 8
subsample = 0.8
colsample_bytree = 0.8

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "reg:linear",
    "booster" : "gbtree",
    "eval_metric": "rmse",
    "eta": eta,
    "max_depth": max_depth,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": 19960429
}

watchlist  = [(dtrain,'train'),(dtest,'val')]
num_round = 3000
early_stopping_rounds=50
bst = xgb.train(params, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)
'''
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import matplotlib.pylab as plt

# `time` is used for the runtime measurement below; import it here because
# the file's other `import time` only appears after this section.
import time

# Keep Relevant Variables..
trainshape = train_x.shape
testshape = test_file.shape

# print("\nTrain DF..")
# train = reduce_mem_usage(train)
# print("\nTest DF..")
# test_df = reduce_mem_usage(test_df)

# LGBM Dataset Formating
# free_raw_data=False keeps the raw frame reachable as dtrain.data, which the
# fold loop needs for out-of-fold prediction.
dtrain = lgb.Dataset(train_x, label=y, free_raw_data=False)

print("Light Gradient Boosting Regressor: ")
# NOTE(review): 'subsample' may have no effect unless a bagging frequency is
# also configured -- confirm against the LightGBM parameter documentation.
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth':7,
    'learning_rate':.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

folds = KFold(n_splits=5, shuffle=True, random_state=1)
# fold_preds accumulates the average test prediction over the folds;
# oof_preds holds each training row's prediction from the fold that held it out.
fold_preds = np.zeros(testshape[0])
oof_preds = np.zeros(trainshape[0])
dtrain.construct()

# Fit 5 Folds
modelstart = time.time()
for trn_idx, val_idx in folds.split(file):
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older lgb.train signatures; newer LightGBM releases expect callbacks
    # instead -- confirm against the installed version.
    clf = lgb.train(
        params=lgbm_params,
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=17000, 
        early_stopping_rounds=250,
        verbose_eval=500
    )
    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    fold_preds += clf.predict(test_file) / folds.n_splits
    # RMSE of this fold's held-out predictions.
    print(mean_squared_error(y.iloc[val_idx], oof_preds[val_idx]) ** .5)
    # lgb.plot_importance(clf, max_num_features=30)
    
print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))


import time

# The XGBoost prediction path is disabled, so its DMatrix is no longer built.
# Ttest = xgb.DMatrix(test_file)
# ypred = bst.predict(Ttest)
ypred = fold_preds

# Re-read the raw test CSV to obtain the original 'key' values for the
# submission (object columns of test_file were label-encoded earlier).
new_test = pd.read_csv('./data/test.csv')
output = pd.DataFrame({ 'key' : new_test['key'], 'fare_amount': ypred })
print(output.head())

# Timestamped output name. Stored under its own name so the `dt` alias of the
# datetime module imported earlier is not overwritten.
timestamp = time.strftime('%Y%m%d%H%M%S',time.localtime())
output.to_csv('.//data//ans'+timestamp+'.csv', index = False)